# Source code for buzzard._dataset

""">>> help(buzz.Dataset)"""

# pylint: disable=too-many-lines
import sys
import pathlib
import itertools
from types import MappingProxyType
import os

from osgeo import osr
import numpy as np

from buzzard._tools import conv, deprecation_pool
from buzzard._tools import GDALErrorCatcher as Catch

from buzzard._footprint import Footprint
from buzzard import _tools
from buzzard._dataset_back import BackDataset
from buzzard._a_source import ASource
from buzzard._gdal_file_raster import GDALFileRaster, BackGDALFileRaster
from buzzard._gdal_file_vector import GDALFileVector, BackGDALFileVector
from buzzard._gdal_mem_raster import GDALMemRaster
from buzzard._gdal_memory_vector import GDALMemoryVector
from buzzard._dataset_register import DatasetRegisterMixin
from buzzard._numpy_raster import NumpyRaster
from buzzard._cached_raster_recipe import CachedRasterRecipe
from buzzard._a_pooled_emissary import APooledEmissary
import buzzard.utils

class Dataset(DatasetRegisterMixin):
    """**Dataset** is a class that stores references to sources. A source is either a raster, or a
    vector. A `Dataset` allows:

    + quick manipulations by optionally assigning a key to each registered source
      (see :ref:`Sources Registering` below),
    + closing all sources at once by closing the Dataset object.

    But also inter-sources operations, like:

    + limiting maximum number of file descriptors,
    + spatial reference harmonization (see :ref:`On the fly re-projections in buzzard` below),
    + workload scheduling on pools when using async rasters (see :ref:`Scheduler` below),
    + other features in the future (like data visualization).

    For actions specific to opened sources, see those classes:

    - :doc:`source_gdal_file_raster`
    - :doc:`source_gdal_mem_raster`
    - :doc:`source_numpy_raster`
    - :doc:`source_cached_raster_recipe`
    - :doc:`source_gdal_file_vector`
    - :doc:`source_gdal_memory_vector`

    .. warning::
        This class is not equivalent to the `gdal.Dataset` class.

    Parameters
    ----------
    sr_work: None or string
        In order to set a spatial reference, use a string that can be converted to WKT by GDAL
        (the conversion is performed with `osr.GetUserInputAsWKT`).

        (see :ref:`On the fly re-projections in buzzard` below)
    sr_fallback: None or string
        In order to set a spatial reference, use a string that can be converted to WKT by GDAL
        (the conversion is performed with `osr.GetUserInputAsWKT`).

        (see :ref:`On the fly re-projections in buzzard` below)
    sr_forced: None or string
        In order to set a spatial reference, use a string that can be converted to WKT by GDAL
        (the conversion is performed with `osr.GetUserInputAsWKT`).

        (see :ref:`On the fly re-projections in buzzard` below)
    analyse_transformation: bool
        Whether or not to perform a basic analysis on two `sr` to check their compatibility.

        if True: Read the `buzz.env.significant` variable and raise an exception if a spatial
        reference conversion is too lossy in precision.

        if False: Skip all checks.

        (see :ref:`On the fly re-projections in buzzard` below)
    allow_none_geometry: bool
        Whether or not a vector geometry should raise an exception when encountering a None geometry
    allow_interpolation: bool
        Whether or not a raster geometry should raise an exception when remapping with interpolation
        is necessary.
    max_active: nbr >= 1
        Maximum number of pooled sources active at the same time.
        (see :ref:`Sources activation / deactivation` below)
    debug_observers: sequence of object
        Entry points to observe what is happening in the Dataset's scheduler.
    **kwargs:
        Only the deprecated parameter names are accepted here: `sr_implicit` (renamed to
        `sr_fallback`), `sr_origin` (renamed to `sr_forced`) and `max_activated` (renamed to
        `max_active`). Any other keyword raises a `TypeError`.

    Raises
    ------
    TypeError
        If an unexpected keyword argument is provided.
    ValueError
        If one of the `sr_*` parameters cannot be converted to WKT, if the combination of `sr_*`
        parameters does not match one of the modes listed below, or if `max_active` is lower
        than 1.

    Examples
    --------
    >>> import buzzard as buzz

    Creating a Dataset.

    >>> ds = buzz.Dataset()

    Opening a file and registering it under the 'roofs' key. There are four ways to access
    this source.

    >>> r = ds.open_vector('roofs', 'path/to/roofs.shp')
    ... feature_count = len(ds.roofs)
    ... feature_count = len(ds['roofs'])
    ... feature_count = len(ds.get('roofs'))
    ... feature_count = len(r)

    Opening a file anonymously. There is only one way to access that source.

    >>> r = ds.aopen_raster('path/to/dem.tif')
    ... data_type = r.dtype

    Opening, reading and closing raster files with context management.

    >>> with ds.open_raster('rgb', 'path/to/rgb.tif').close:
    ...     data_type = ds.rgb.fp
    ...     arr = ds.rgb.get_data()

    >>> with ds.aopen_raster('path/to/rgb.tif').close as rgb:
    ...     data_type = rgb.dtype
    ...     arr = rgb.get_data()

    Creating files

    >>> ds.create_vector('targets', 'path/to/targets.geojson', 'point', driver='GeoJSON')
    ... geometry_type = ds.targets.type

    >>> with ds.acreate_raster('/tmp/cache.tif', ds.dem.fp, 'float32', 1).delete as cache:
    ...     file_footprint = cache.fp
    ...     cache.set_data(dem.get_data())

    Sources Types
    -------------
    - Raster sources

        - GDAL drivers (e.g. 'GTiff', 'JPEG', 'PNG', ...)
        - numpy.ndarray
        - recipes

    - Vector sources

        - OGR drivers: (e.g. 'ESRI Shapefile', 'GeoJSON', 'DXF', ...)

    .. _Sources Registering:

    Sources Registering
    -------------------
    There are always two ways to create a source, with a key or anonymously.

    When creating a source using a key, said key (e.g. the string "my_source_name") must be
    provided by user. Each key identifies one source and should thus be unique. There are then
    four ways to access that source:

    - using object returned by the method that created the source,
    - from the Dataset using the attribute syntax: `ds.my_source_name`,
    - from the Dataset using the item syntax: ds["my_source_name"],
    - from the Dataset using the get method: ds.get("my_source_name").

    All keys should be unique.

    When creating a source anonymously you don't have to provide a key, but the only way to access
    this source is to use the object returned by the method that created the source.

    .. _Sources activation / deactivation:

    Sources activation / deactivation
    ---------------------------------
    The sources that inherit from `APooledEmissary` (like `GDALFileVector` and `GDALFileRaster`)
    are flexible about their underlying driver object. Those sources may be temporarily
    deactivated (useful to limit the number of file descriptors active), or activated multiple
    times at the same time (useful to perform concurrent reads).

    Those sources are automatically activated and deactivated given the current needs and
    constraints. Setting a `max_active` lower than `np.inf` in the Dataset constructor will
    ensure that no more than `max_active` driver objects are active at the same time, by
    deactivating the LRU ones.

    .. _On the fly re-projections in buzzard:

    On the fly re-projections in buzzard
    ------------------------------------
    A Dataset may perform spatial reference conversions on the fly, like a GIS does. Several
    modes are available, a set of rules defines how each mode works. Those conversions concern
    both read operations and write operations, all are performed by the OSR library.

    Those conversions are only performed on vector's data/metadata and raster's Footprints.
    This implies that classic raster warping is not included (yet) in those conversions, only
    raster shifting/scaling/rotation work.

    The `z` coordinates of vectors geometries are also converted, on the other hand elevations
    are not converted in DEM rasters.

    If `analyse_transformation` is set to `True` (default), all coordinates conversions are
    tested against `buzz.env.significant` on file opening to ensure their feasibility or
    raise an exception otherwise. This system is naive and very restrictive, use with caution.
    Although, disabling those tests is not recommended, ignoring floating point precision errors
    can create unpredictable behaviors at the pixel level deep in your code. Those bugs can be
    witnessed when zooming to infinity with tools like `qgis` or `matplotlib`.

    On the fly re-projections in buzzard - Terminology
    --------------------------------------------------
    `sr`
        Spatial reference
    `sr_work`
        The sr of all interactions with a Dataset (i.e. Footprints, extents, Polygons...),
        may be None.
    `sr_stored`
        The sr that can be found in the metadata of a raster/vector storage, may be None.
    `sr_virtual`
        The sr considered to be written in the metadata of a raster/vector storage, it is often
        the same as `sr_stored`. When a raster/vector is read, a conversion is performed from
        `sr_virtual` to `sr_work`. When writing vector data, a conversion is performed from
        `sr_work` to `sr_virtual`.
    `sr_forced`
        A `sr_virtual` provided by user to ignore all `sr_stored`. This is for example useful when
        the `sr` stored in the input files are corrupted.
    `sr_fallback`
        A `sr_virtual` provided by user to be used when `sr_stored` is missing. This is for example
        useful when an input file can't store a `sr` (e.g. DXF).

    On the fly re-projections in buzzard - Dataset parameters and modes
    --------------------------------------------------------------------

    +------+----------+--------------+------------+------------------------------------------------------------------------------+
    | mode | sr_work  | sr_fallback  | sr_forced  | How is the `sr_virtual` of a source determined                               |
    +======+==========+==============+============+==============================================================================+
    | 1    | None     | None         | None       | Use `sr_stored`, no conversion is performed for the lifetime of this Dataset |
    +------+----------+--------------+------------+------------------------------------------------------------------------------+
    | 2    | string   | None         | None       | Use `sr_stored`, if None raises an exception                                 |
    +------+----------+--------------+------------+------------------------------------------------------------------------------+
    | 3    | string   | string       | None       | Use `sr_stored`, if None it is considered to be `sr_fallback`                |
    +------+----------+--------------+------------+------------------------------------------------------------------------------+
    | 4    | string   | None         | string     | Use `sr_forced`                                                              |
    +------+----------+--------------+------------+------------------------------------------------------------------------------+

    On the fly re-projections in buzzard - Use cases
    ------------------------------------------------
    - If all opened files are known to be written in a same sr in advance, use `mode 1`.
      No conversions will be performed, this is the safest way to work.
    - If all opened files are known to be written in the same sr but you wish to work in a
      different sr, use `mode 4`. The huge benefit of this mode is that the `driver` specific
      behaviors concerning spatial references have no impacts on the data you manipulate.
    - On the other hand if you don't have a priori information on files' `sr`, `mode 2` or
      `mode 3` should be used.

    .. warning::
        Side note: Since the GeoJSON driver cannot store a `sr`, it is impossible to open or
        create a GeoJSON file in `mode 2`.

    On the fly re-projections in buzzard - Examples
    -----------------------------------------------
    mode 1 - No conversions at all

    >>> ds = buzz.Dataset()

    mode 2 - Working with WGS84 coordinates

    >>> ds = buzz.Dataset(
    ...     sr_work='WGS84',
    ... )

    mode 3 - Working in UTM with DXF files in WGS84 coordinates

    >>> ds = buzz.Dataset(
    ...     sr_work='EPSG:32632',
    ...     sr_fallback='WGS84',
    ... )

    mode 4 - Working in UTM with unreliable LCC input files

    >>> ds = buzz.Dataset(
    ...     sr_work='EPSG:32632',
    ...     sr_forced='EPSG:27561',
    ... )

    .. _Scheduler:

    Scheduler
    ---------
    To handle *async rasters* living in a Dataset, a thread is spawned to manage requests made
    to those rasters. It will start as soon as you create an *async raster* and stop when the
    Dataset is closed or collected. If one of your callback to be called by the scheduler raises
    an exception, the scheduler will stop and the exception will be propagated to the main thread
    as soon as possible.

    Thread-safety
    -------------
    Thread safety is one of the main concern of buzzard. Everything is thread-safe except:

    - The raster write methods
    - The vector write methods
    - The raster read methods when using the `GDAL::MEM` driver
    - The vector read methods when using the `GDAL::Memory` driver

    Parallel reads of rasters and vectors are natively supported in buzzard.

    """

    def __init__(self, sr_work=None, sr_fallback=None, sr_forced=None,
                 analyse_transformation=True,
                 allow_none_geometry=False,
                 allow_interpolation=False,
                 max_active=np.inf,
                 debug_observers=(),
                 **kwargs):
        # Deprecated parameters ************************************************
        # Each call maps a legacy keyword found in `kwargs` onto its new name and
        # returns the remaining, still unconsumed, keyword arguments.
        sr_fallback, kwargs = deprecation_pool.handle_param_renaming_with_kwargs(
            new_name='sr_fallback', old_names={'sr_implicit': '0.4.4'}, context='Dataset.__init__',
            new_name_value=sr_fallback,
            new_name_is_provided=sr_fallback is not None,
            user_kwargs=kwargs,
        )
        sr_forced, kwargs = deprecation_pool.handle_param_renaming_with_kwargs(
            new_name='sr_forced', old_names={'sr_origin': '0.4.4'}, context='Dataset.__init__',
            new_name_value=sr_forced,
            new_name_is_provided=sr_forced is not None,
            user_kwargs=kwargs,
        )
        max_active, kwargs = deprecation_pool.handle_param_renaming_with_kwargs(
            new_name='max_active', old_names={'max_activated': '0.5.0'}, context='Dataset.__init__',
            new_name_value=max_active,
            new_name_is_provided=max_active != np.inf,
            user_kwargs=kwargs,
        )
        if kwargs: # pragma: no cover
            raise TypeError("__init__() got an unexpected keyword argument '{}'".format(
                next(iter(kwargs))
            ))

        # Spatial references ***************************************************
        def _to_wkt(name, sr):
            """Convert the user provided `sr` to WKT, `None` is passed through."""
            if sr is None:
                return None
            success, payload = Catch(osr.GetUserInputAsWKT, nonzero_int_is_error=True)(sr)
            if not success:
                raise ValueError('Could not transform `{}` to `wkt` (gdal error: `{}`)'.format(
                    name, payload[1]
                ))
            return payload

        # (work, fallback, forced) presence flags of the four supported modes,
        # see the modes table in the class docstring.
        valid_modes = {
            (False, False, False), # mode 1
            (True, False, False), # mode 2
            (True, True, False), # mode 3
            (True, False, True), # mode 4
        }
        mode = (sr_work is not None, sr_fallback is not None, sr_forced is not None)
        if mode not in valid_modes: # pragma: no cover
            raise ValueError('Bad combination of `sr_*` parameters')

        # The combination is validated before any conversion, and `sr_work` is
        # converted first, so errors surface in the same order as they always did.
        wkt_work = _to_wkt('sr_work', sr_work)
        wkt_fallback = _to_wkt('sr_fallback', sr_fallback)
        wkt_forced = _to_wkt('sr_forced', sr_forced)
        del sr_work, sr_fallback, sr_forced

        # Other parameters *****************************************************
        if max_active < 1: # pragma: no cover
            raise ValueError('`max_active` should be greater than or equal to 1')

        allow_interpolation = bool(allow_interpolation)
        allow_none_geometry = bool(allow_none_geometry)
        analyse_transformation = bool(analyse_transformation)

        # Construction *********************************************************
        self._ds_closed = False
        self._back = BackDataset(
            wkt_work=wkt_work,
            wkt_fallback=wkt_fallback,
            wkt_forced=wkt_forced,
            analyse_transformation=analyse_transformation,
            allow_none_geometry=allow_none_geometry,
            allow_interpolation=allow_interpolation,
            max_active=max_active,
            ds_id=id(self),
            debug_observers=debug_observers,
        )
        super().__init__()

    # Raster entry points *********************************************************************** **
[docs] def open_raster(self, key, path, driver='GTiff', options=(), mode='r'): """Open a raster file within this Dataset under `key`. Only metadata are kept in memory. >>> help(GDALFileRaster) Parameters ---------- key: hashable (like a string) File identifier within Dataset To avoid using a `key`, you may use :py:meth:`aopen_raster` path: string .. driver: string gdal driver to use when opening the file options: sequence of str options for gdal mode: one of {'r', 'w'} .. Returns ------- source: GDALFileRaster .. Example ------- >>> ds.open_raster('ortho', '/path/to/ortho.tif') >>> file_proj4 = ds.ortho.proj4_stored >>> ds.open_raster('dem', '/path/to/dem.tif', mode='w') >>> nodata_value = ds.dem.nodata See Also -------- - :py:meth:`Dataset.aopen_raster`: To skip the `key` assigment - :py:func:`buzzard.open_raster`: To skip the `key` assigment and the explicit `Dataset` instanciation """ # Parameter checking *************************************************** path = str(path) driver = str(driver) options = [str(arg) for arg in options] _ = conv.of_of_mode(mode) # Construction dispatch ************************************************ if driver.lower() == 'mem': # pragma: no cover raise ValueError("Can't open a MEM raster, user create_raster") elif True: allocator = lambda: BackGDALFileRaster.open_file( path, driver, options, mode ) prox = GDALFileRaster(self, allocator, options, mode) else: pass # Dataset Registering *********************************************** if not isinstance(key, _AnonymousSentry): self._register([key], prox) else: self._register([], prox) return prox
[docs] def aopen_raster(self, path, driver='GTiff', options=(), mode='r'): """Open a raster file anonymously within this Dataset. Only metadata are kept in memory. See :py:meth:`~Dataset.open_raster` Example ------ >>> ortho = ds.aopen_raster('/path/to/ortho.tif') >>> file_wkt = ortho.wkt_stored See Also -------- - :py:meth:`Dataset.open_raster`: To assign a `key` to this source within the `Dataset` - :py:func:`buzzard.open_raster`: To skip the explicit `Dataset` instanciation """ return self.open_raster(_AnonymousSentry(), path, driver, options, mode)
[docs] def create_raster(self, key, path, fp, dtype, channel_count, channels_schema=None, driver='GTiff', options=(), sr=None, ow=False, **kwargs): """Create a raster file and register it under `key` within this Dataset. Only metadata are kept in memory. The raster's values are initialized with `channels_schema['nodata']` or `0`. >>> help(GDALFileRaster) >>> help(GDALMemRaster) Parameters ---------- key: hashable (like a string) File identifier within Dataset To avoid using a `key`, you may use :py:meth:`acreate_raster` path: string Anything that makes sense to GDAL: + A path to a file + An empty string when using `driver=MEM` + A path or an xml string when using `driver=VRT` fp: Footprint Description of the location and size of the raster to create. dtype: numpy type (or any alias) .. channel_count: integer number of channels channels_schema: dict or None Channel(s) metadata. (see `Channels schema fields` below) driver: string gdal driver to use when opening the file options: sequence of str options for gdal sr: string or None Spatial reference of the new file. In order not to set a spatial reference, use `None`. In order to set a spatial reference, use a string that can be `converted to WKT by GDAL <>`_. ow: bool Overwrite. Whether or not to erase the existing files. 
Returns ------- source: GDALFileRaster or GDALMemRaster The type depends on the `driver` parameter Example ------- >>> ds.create_raster('dem_copy', 'dem_copy.tif', ds.dem.fp, ds.dsm.dtype, len(ds.dem)) >>> array = ds.dem.get_data() >>> ds.dem_copy.set_data(array) Channel schema fields --------------------- Fields: 'nodata': None or number 'interpretation': None or str 'offset': None or number 'scale': None or number 'mask': None or str Interpretation values: undefined, grayindex, paletteindex, redband, greenband, blueband, alphaband, hueband, saturationband, lightnessband, cyanband, magentaband, yellowband, blackband Mask values: all_valid, per_dataset, alpha, nodata Additionally: - A field missing or None is kept to default value. - A field can be passed as - a value: All bands are set to this value - a sequence of values of length `channel_count`: All bands will be set to their respective state Caveat ------ When using the GTiff driver, specifying a `mask` or `interpretation` field may lead to unexpected results. 
See Also -------- - :py:meth:`Dataset.acreate_raster`: To skip the `key` assigment - :py:func:`buzzard.create_raster`: To skip the `key` assigment and the explicit `Dataset` instanciation """ # Deprecated parameters ************************************************ channels_schema, kwargs = deprecation_pool.handle_param_renaming_with_kwargs( new_name='channels_schema', old_names={'band_schema': '0.6.0'}, context='Dataset.create_raster', new_name_value=channels_schema, new_name_is_provided=channels_schema is not None, user_kwargs=kwargs, ) if kwargs: # pragma: no cover raise TypeError("create_raster() got an unexpected keyword argument '{}'".format( list(kwargs.keys())[0] )) # Parameter checking *************************************************** ow = bool(ow) path = str(path) if not isinstance(fp, Footprint): # pragma: no cover raise TypeError('`fp` should be a Footprint') dtype = np.dtype(dtype) channel_count = int(channel_count) if channel_count <= 0: raise ValueError('`channel_count` should be >0') channels_schema = _tools.sanitize_channels_schema(channels_schema, channel_count) driver = str(driver) options = [str(arg) for arg in options] if sr is not None: success, payload = Catch(osr.GetUserInputAsWKT, nonzero_int_is_error=True)(sr) if not success: raise ValueError('Could not transform `sr` to `wkt` (gdal error: `{}`)'.format( payload[1] )) wkt = payload else: wkt = None del sr if wkt is not None: fp = self._back.convert_footprint(fp, wkt) # Construction dispatch ************************************************ if driver.lower() == 'mem': # TODO for 0.5.0: Check async_ is False prox = GDALMemRaster( self, fp, dtype, channel_count, channels_schema, options, wkt, ) elif True: allocator = lambda: BackGDALFileRaster.create_file( path, fp, dtype, channel_count, channels_schema, driver, options, wkt, ow, ) prox = GDALFileRaster(self, allocator, options, 'w') else: pass # Dataset Registering *********************************************** if not isinstance(key, 
_AnonymousSentry): self._register([key], prox) else: self._register([], prox) return prox
[docs] def acreate_raster(self, path, fp, dtype, channel_count, channels_schema=None, driver='GTiff', options=(), sr=None, ow=False, **kwargs): """Create a raster file anonymously within this Dataset. Only metadata are kept in memory. See :py:meth:`~Dataset.create_raster` Example ------- >>> mask = ds.acreate_raster('mask.tif', ds.dem.fp, bool, 1, options=['SPARSE_OK=YES']) >>> open_options = mask.open_options >>> channels_schema = { ... 'nodata': -32767, ... 'interpretation': ['blackband', 'cyanband'], ... } >>> out = ds.acreate_raster('output.tif', ds.dem.fp, 'float32', 2, channels_schema) >>> band_interpretation = out.channels_schema['interpretation'] See Also -------- - :py:meth:`Dataset.create_raster`: To assign a `key` to this source within the `Dataset` - :py:func:`buzzard.create_raster`: To skip the explicit `Dataset` instanciation """ return self.create_raster(_AnonymousSentry(), path, fp, dtype, channel_count, channels_schema, driver, options, sr, ow, **kwargs)
[docs] def wrap_numpy_raster(self, key, fp, array, channels_schema=None, sr=None, mode='w', **kwargs): """Register a numpy array as a raster under `key` within this Dataset. >>> help(NumpyRaster) Parameters ---------- key: hashable (like a string) File identifier within Dataset To avoid using a `key`, you may use :py:meth:`awrap_numpy_raster` fp: Footprint of shape (Y, X) Description of the location and size of the raster to create. array: ndarray of shape (Y, X) or (Y, X, C) .. channels_schema: dict or None Channel(s) metadata. (see `Channels schema fields` below) sr: string or None Spatial reference of the new file In order not to set a spatial reference, use `None`. In order to set a spatial reference, use a string that can be `converted to WKT by GDAL <>`_. Returns ------- source: NumpyRaster .. Channel schema fields --------------------- Fields: 'nodata': None or number 'interpretation': None or str 'offset': None or number 'scale': None or number 'mask': None or str Interpretation values: undefined, grayindex, paletteindex, redband, greenband, blueband, alphaband, hueband, saturationband, lightnessband, cyanband, magentaband, yellowband, blackband Mask values: all_valid, per_dataset, alpha, nodata Additionally: - A field missing or None is kept to default value. 
- A field can be passed as - a value: All bands are set to this value - a sequence of values of length `channel_count`: All bands will be set to their respective state See Also -------- - :py:meth:`Dataset.awrap_numpy_raster`: To skip the `key` assigment - :py:meth:`buzzard.wrap_numpy_raster`: To skip the `key` assigment and the explicit `Dataset` instanciation """ # Deprecated parameters ************************************************ channels_schema, kwargs = deprecation_pool.handle_param_renaming_with_kwargs( new_name='channels_schema', old_names={'band_schema': '0.6.0'}, context='Dataset.wrap_numpy_raster', new_name_value=channels_schema, new_name_is_provided=channels_schema is not None, user_kwargs=kwargs, ) if kwargs: # pragma: no cover raise TypeError("wrap_numpy_raster() got an unexpected keyword argument '{}'".format( list(kwargs.keys())[0] )) # Parameter checking *************************************************** if not isinstance(fp, Footprint): # pragma: no cover raise TypeError('`fp` should be a Footprint') array = np.asarray(array) if array.shape[:2] != tuple(fp.shape): # pragma: no cover raise ValueError('Incompatible shape between `array` and `fp`') if array.ndim not in [2, 3]: # pragma: no cover raise ValueError('Array should have 2 or 3 dimensions') channel_count = 1 if array.ndim == 2 else array.shape[-1] channels_schema = _tools.sanitize_channels_schema(channels_schema, channel_count) if sr is not None: success, payload = Catch(osr.GetUserInputAsWKT, nonzero_int_is_error=True)(sr) if not success: raise ValueError('Could not transform `sr` to `wkt` (gdal error: `{}`)'.format( payload[1] )) wkt = payload else: wkt = None del sr _ = conv.of_of_mode(mode) if wkt is not None: fp = self._back.convert_footprint(fp, wkt) # Construction ********************************************************* prox = NumpyRaster(self, fp, array, channels_schema, wkt, mode) # Dataset Registering *********************************************** if not isinstance(key, 
_AnonymousSentry): self._register([key], prox) else: self._register([], prox) return prox
[docs] def awrap_numpy_raster(self, fp, array, channels_schema=None, sr=None, mode='w', **kwargs): """Register a numpy array as a raster anonymously within this Dataset. See Also -------- - :py:meth:`Dataset.wrap_numpy_raster`: To assign a `key` to this source within the `Dataset` - :py:meth:`buzzard.wrap_numpy_raster`: To skip the `key` assigment and the explicit `Dataset` instanciation """ return self.wrap_numpy_raster( _AnonymousSentry(), fp, array, channels_schema, sr, mode, **kwargs )
    def create_raster_recipe(
            self, key,

            # raster attributes
            fp, dtype, channel_count, channels_schema=None, sr=None,

            # callbacks running on pool
            compute_array=None, merge_arrays=buzzard.utils.concat_arrays,

            # primitives
            queue_data_per_primitive=MappingProxyType({}),
            convert_footprint_per_primitive=None,

            # pools
            computation_pool='cpu', merge_pool='cpu', resample_pool='cpu',

            # misc
            computation_tiles=None, max_computation_size=None,
            max_resampling_size=None, automatic_remapping=True,
            debug_observers=(),
    ):
        """
        .. warning::
            This method is not yet implemented. It exists for documentation purposes.

        Create a *raster recipe* and register it under `key` within this Dataset.

        A *raster recipe* implements the same interfaces as all other rasters, but internally it
        computes data on the fly by calling a callback. The main goal of the *raster recipes* is to
        provide a boilerplate-free interface that automates those cumbersome tasks:

        - tiling,
        - parallelism
        - caching
        - file reads
        - resampling
        - lazy evaluation
        - backpressure prevention and
        - optimised task scheduling.

        If you are familiar with `create_cached_raster_recipe` two parameters are new here:
        `automatic_remapping` and `max_computation_size`.

        Parameters
        ----------
        key:
            see :py:meth:`Dataset.create_raster`
        fp:
            see :py:meth:`Dataset.create_raster`
        dtype:
            see :py:meth:`Dataset.create_raster`
        channel_count:
            see :py:meth:`Dataset.create_raster`
        channels_schema:
            see :py:meth:`Dataset.create_raster`
        sr:
            see :py:meth:`Dataset.create_raster`
        compute_array: callable
            see :ref:`Computation Function` below
        merge_arrays: callable
            see :ref:`Merge Function` below
        queue_data_per_primitive: dict of hashable (like a string) to a `queue_data` method pointer
            see :ref:`Primitives` below
        convert_footprint_per_primitive: None or dict of hashable (like a string) to a callable
            see :ref:`Primitives` below
        computation_pool:
            see :ref:`Pools` below
        merge_pool:
            see :ref:`Pools` below
        resample_pool:
            see :ref:`Pools` below
        computation_tiles: None or (int, int) or numpy.ndarray of Footprint
            see :ref:`Computation Tiling` below
        max_computation_size: None or int or (int, int)
            see :ref:`Computation Tiling` below
        max_resampling_size: None or int or (int, int)
            Optionally define a maximum resampling size. If a larger resampling has to be performed,
            it will be performed tile by tile in parallel.
        automatic_remapping: bool
            see :ref:`Automatic Remapping` below
        debug_observers: sequence of object
            Entry points that observe what is happening with this raster in the Dataset's scheduler.

        Returns
        -------
        source: NocacheRasterRecipe
            ..

        Raises
        ------
        NotImplementedError
            Always, this method is not yet implemented.

        .. _Computation Function:

        Computation Function
        --------------------
        The function that will map a Footprint to a numpy.ndarray. If `queue_data_per_primitive`
        is not empty, it will map a Footprint and primitive arrays to a numpy.ndarray.

        It will be called in parallel according to the `computation_pool` parameter provided at
        construction.

        The function will be called with the following positional parameters:

        - fp: Footprint of shape (Y, X)
            The location at which the pixels should be computed
        - primitive_fps: dict of hashable to Footprint
            For each primitive defined through the `queue_data_per_primitive` parameter, the input
            Footprint.
        - primitive_arrays: dict of hashable to numpy.ndarray
            For each primitive defined through the `queue_data_per_primitive` parameter, the input
            numpy.ndarray that was automatically computed.
        - raster: CachedRasterRecipe or None
            The Raster object of the ongoing computation.

        It should return either:

        - a single ndarray of shape (Y, X) if only one channel was computed
        - a single ndarray of shape (Y, X, C) if one or more channels were computed

        If `computation_pool` points to a process pool, the `compute_array` function must be
        picklable and the `raster` parameter will be None.

        .. _Computation Tiling:

        Computation Tiling
        ------------------
        You may sometimes want to have control on the Footprints that are requested to the
        `compute_array` function, for example:

        - If pixels computed by `compute_array` are long to compute, you want to tile to increase
          parallelism.
        - If the `compute_array` function scales badly in term of memory or time, you want to tile
          to reduce complexity.
        - If `compute_array` can work only on certain Footprints, you want a hard constraint on the
          set of Footprint that can be queried from `compute_array`. (This may happen with
          *convolutional neural networks*)

        To do so use the `computation_tiles` or `max_computation_size` parameter (not both).

        If `max_computation_size` is provided, a Footprint to be computed will be tiled given this
        parameter.

        If `computation_tiles` is a numpy.ndarray of Footprint, it should be a tiling of the `fp`
        parameter. Only the Footprints contained in this tiling will be asked to the
        `compute_array` function. If `computation_tiles` is (int, int), a tiling will be
        constructed using Footprint.tile using those two ints.

        .. _Merge Function:

        Merge Function
        --------------
        The function that will map several pairs of Footprint/numpy.ndarray to a single
        numpy.ndarray. If the `computation_tiles` is None, it will never be called.

        It will be called in parallel according to the `merge_pool` parameter provided at
        construction.

        The function will be called with the following positional parameters:

        - fp: Footprint of shape (Y, X)
            The location at which the pixels should be computed.
        - array_per_fp: dict of Footprint to numpy.ndarray
            The pairs of Footprint/numpy.ndarray of each arrays that were computed by
            `compute_array` and that overlap with `fp`.
        - raster: CachedRasterRecipe or None
            The Raster object of the ongoing computation.

        It should return either:

        - a single ndarray of shape (Y, X) if only one channel was computed
        - a single ndarray of shape (Y, X, C) if one or more channels were computed

        If `merge_pool` points to a process pool, the `merge_arrays` function must be picklable
        and the `raster` parameter will be None.

        .. _Automatic Remapping:

        Automatic Remapping
        -------------------
        When creating a recipe you give a *Footprint* through the `fp` parameter. When calling your
        `compute_array` function the scheduler will only ask for slices of `fp`. This means that
        the scheduler takes care of those boilerplate steps:

        - If you request a *Footprint* on a different grid in a `get_data()` call, the scheduler
          **takes care of resampling** the outputs of your `compute_array` function.
        - If you request a *Footprint* partially or fully outside of the raster's extent, the
          scheduler will call your `compute_array` function to get the interior pixels and then
          **pad the output with nodata**.

        This system is flexible and can be deactivated by passing `automatic_remapping=False` to
        the constructor of a *NocacheRasterRecipe*, in this case the scheduler will call your
        `compute_array` function for any kind of *Footprint*; thus your function must be able to
        comply with any request.

        .. _Primitives:

        Primitives
        ----------
        The `queue_data_per_primitive` and `convert_footprint_per_primitive` parameters can be used
        to create dependencies between `dependee` *async rasters* and the *raster recipe* being
        created. The dependee/dependent relation is called primitive/derived throughout buzzard.
        A derived recipe can itself be the primitive of another raster. Pipelines of any depth and
        width can be instantiated that way.

        In `queue_data_per_primitive` you declare a `dependee` by giving it a key of your choice
        and the pointer to the `queue_data` method of `dependee` raster. You can parameterize the
        connection by *currying* the `channels`, `dst_nodata`, `interpolation` and
        `max_queue_size` parameters using `functools.partial`.

        The `convert_footprint_per_primitive` dict should contain the same keys as
        `queue_data_per_primitive`. A value in the dict should be a function that maps a Footprint
        to another Footprint. It can be used for example to request larger rectangles of
        primitives data to compute a derived array.

        e.g. If the primitive raster is an `rgb` image, and the derived raster only needs the green
        channel but with a context of 10 additional pixels on all 4 sides:

        >>> derived = ds.create_raster_recipe(
        ...     # <other parameters>
        ...     queue_data_per_primitive={'green': functools.partial(primitive.queue_data, channels=1)},
        ...     convert_footprint_per_primitive={'green': lambda fp: fp.dilate(10)},
        ... )

        .. _Pools:

        Pools
        -----
        The `*_pool` parameters can be used to select where certain computations occur. Those
        parameters can be of the following types:

        - A *multiprocessing.pool.ThreadPool*, should be the default choice.
        - A *multiprocessing.pool.Pool*, a process pool. Useful for computations that require the
          GIL or that leak memory.
        - `None`, to request the scheduler thread to perform the tasks itself. Should be used when
          the computation is very light.
        - A *hashable* (like a *string*), that will map to a pool registered in the *Dataset*. If
          that key is missing from the *Dataset*, a *ThreadPool* with
          `multiprocessing.cpu_count()` workers will be automatically instantiated. When the
          Dataset is closed, the pools instantiated that way will be joined.

        See Also
        --------
        - :py:meth:`Dataset.create_cached_raster_recipe`: For results `caching`
        - :py:meth:`Dataset.acreate_cached_raster_recipe`: To skip the `key` assignment

        """
        # Documentation-only entry point: none of the parameters is read, calling
        # this method always fails.
        raise NotImplementedError()
def create_cached_raster_recipe(
        self, key,

        # raster attributes
        fp, dtype, channel_count, channels_schema=None, sr=None,

        # callbacks running on pool
        compute_array=None, merge_arrays=buzzard.utils.concat_arrays,

        # filesystem
        cache_dir=None, ow=False,

        # primitives
        queue_data_per_primitive=MappingProxyType({}), convert_footprint_per_primitive=None,

        # pools
        computation_pool='cpu', merge_pool='cpu', io_pool='io', resample_pool='cpu',

        # misc
        cache_tiles=(512, 512), computation_tiles=None, max_resampling_size=None,
        debug_observers=()
):
    """Create a *cached raster recipe* and register it under `key` within this Dataset.

    Compared to a `NocacheRasterRecipe`, in a `CachedRasterRecipe` the pixels are never computed
    twice. Cache files are used to store and reuse pixels from computations. The cache can even
    be reused between python sessions.

    If you are familiar with `create_raster_recipe` four parameters are new here: `io_pool`,
    `cache_tiles`, `cache_dir` and `ow`. They are all related to file system operations.

    See `create_raster_recipe` method, since it shares most of the features:

    >>> help(CachedRasterRecipe)

    Parameters
    ----------
    key:
        see :py:meth:`Dataset.create_raster` method
    fp:
        see :py:meth:`Dataset.create_raster` method
    dtype:
        see :py:meth:`Dataset.create_raster` method
    channel_count:
        see :py:meth:`Dataset.create_raster` method
    channels_schema:
        see :py:meth:`Dataset.create_raster` method
    sr:
        see :py:meth:`Dataset.create_raster` method
    compute_array:
        see :py:meth:`Dataset.create_raster_recipe` method
    merge_arrays:
        see :py:meth:`Dataset.create_raster_recipe` method
    cache_dir: str or pathlib.Path
        Path to the directory that holds the cache files associated with this raster. If cache
        files are present, they will be reused (or erased if corrupted). If a cache file is
        needed and missing, it will be computed.
    ow: bool
        Overwrite. Whether or not to erase the old cache files contained in `cache_dir`.

        .. warning::
            not only the tiles needed (hence computed) but all buzzard cache files in
            `cache_dir` will be deleted.
    queue_data_per_primitive:
        see :py:meth:`Dataset.create_raster_recipe` method
    convert_footprint_per_primitive:
        see :py:meth:`Dataset.create_raster_recipe` method
    computation_pool:
        see :py:meth:`Dataset.create_raster_recipe` method
    merge_pool:
        see :py:meth:`Dataset.create_raster_recipe` method
    io_pool:
        see :py:meth:`Dataset.create_raster_recipe` method
    resample_pool:
        see :py:meth:`Dataset.create_raster_recipe` method
    cache_tiles: (int, int) or numpy.ndarray of Footprint
        A tiling of the `fp` parameter. Each tile will correspond to one cache file.
        if (int, int): Construct the tiling by calling Footprint.tile with this parameter
    computation_tiles:
        if None: Use the same tiling as `cache_tiles`
        else: see `create_raster_recipe` method
    max_resampling_size: None or int or (int, int)
        see :py:meth:`Dataset.create_raster_recipe` method
    debug_observers: sequence of object
        see :py:meth:`Dataset.create_raster_recipe` method

    Returns
    -------
    source: CachedRasterRecipe
        ..

    Raises
    ------
    TypeError
        If `fp` is not a Footprint, if a callback is not callable or if `cache_dir` is neither
        a `str` nor a `pathlib.Path`.
    ValueError
        If a mandatory parameter is missing, if `sr` cannot be converted to WKT, if the
        primitives dictionaries disagree, if a primitive belongs to another Dataset or if a
        tiling does not match `fp`.

    See Also
    --------
    - :py:meth:`Dataset.create_raster_recipe`: To skip the `caching`
    - :py:meth:`Dataset.acreate_cached_raster_recipe`: To skip the `key` assigment

    """
    # Parameter checking ***************************************************
    # Classic RasterSource parameters *******************
    if not isinstance(fp, Footprint): # pragma: no cover
        raise TypeError('`fp` should be a Footprint')
    dtype = np.dtype(dtype)
    channel_count = int(channel_count)
    if channel_count <= 0:
        raise ValueError('`channel_count` should be >0')
    channels_schema = _tools.sanitize_channels_schema(channels_schema, channel_count)
    if sr is not None:
        success, payload = Catch(osr.GetUserInputAsWKT, nonzero_int_is_error=True)(sr)
        if not success:
            raise ValueError('Could not transform `sr` to `wkt` (gdal error: `{}`)'.format(
                payload[1]
            ))
        wkt = payload
    else:
        wkt = None
    del sr

    if wkt is not None:
        # `fp` is expressed in the `sr` of the recipe, bring it to the Dataset side
        fp = self._back.convert_footprint(fp, wkt)

    # Callables ****************************************
    if compute_array is None:
        raise ValueError('Missing `compute_array` parameter')
    if not callable(compute_array):
        raise TypeError('`compute_array` should be callable')
    if not callable(merge_arrays):
        raise TypeError('`merge_arrays` should be callable')

    # Primitives ***************************************
    if convert_footprint_per_primitive is None:
        # Default to the identity conversion for every declared primitive
        convert_footprint_per_primitive = {
            name: (lambda fp: fp)
            for name in queue_data_per_primitive.keys()
        }

    if queue_data_per_primitive.keys() != convert_footprint_per_primitive.keys():
        err = 'There should be the same keys in `queue_data_per_primitive` and '
        err += '`convert_footprint_per_primitive`.'
        if queue_data_per_primitive.keys() - convert_footprint_per_primitive.keys():
            err += '\n{} are missing from `convert_footprint_per_primitive`.'.format(
                queue_data_per_primitive.keys() - convert_footprint_per_primitive.keys()
            )
        if convert_footprint_per_primitive.keys() - queue_data_per_primitive.keys():
            err += '\n{} are missing from `queue_data_per_primitive`.'.format(
                convert_footprint_per_primitive.keys() - queue_data_per_primitive.keys()
            )
        raise ValueError(err)

    primitives_back = {}
    primitives_kwargs = {}
    for name, met in queue_data_per_primitive.items():
        primitives_back[name], primitives_kwargs[name] = _tools.shatter_queue_data_method(met, name)
        if primitives_back[name].back_ds is not self._back:
            raise ValueError('The `{}` primitive comes from another Dataset'.format(
                name
            ))

    for name, func in convert_footprint_per_primitive.items():
        if not callable(func):
            raise TypeError('convert_footprint_per_primitive[{}] should be callable'.format(
                name
            ))

    # Pools ********************************************
    computation_pool = self._back.pools_container._normalize_pool_parameter(
        computation_pool, 'computation_pool'
    )
    merge_pool = self._back.pools_container._normalize_pool_parameter(
        merge_pool, 'merge_pool'
    )
    io_pool = self._back.pools_container._normalize_pool_parameter(
        io_pool, 'io_pool'
    )
    resample_pool = self._back.pools_container._normalize_pool_parameter(
        resample_pool, 'resample_pool'
    )

    # Tilings ******************************************
    # The builtin `object` is compared against the dtype: the `np.object` alias no longer
    # exists in recent NumPy releases.
    if isinstance(cache_tiles, np.ndarray) and cache_tiles.dtype == object:
        if not _tools.is_tiling_covering_fp(
                cache_tiles, fp,
                allow_outer_pixels=False, allow_overlapping_pixels=False,
        ):
            raise ValueError("`cache_tiles` should be a tiling of raster's Footprint, " +\
                             "without overlap, with `boundary_effect='shrink'`"
            )
    else:
        # Defer the parameter checking to fp.tile
        cache_tiles = fp.tile(cache_tiles, 0, 0, boundary_effect='shrink')

    if computation_tiles is None:
        computation_tiles = cache_tiles
    elif isinstance(computation_tiles, np.ndarray) and computation_tiles.dtype == object:
        # Validate the user-provided `computation_tiles` itself (not `cache_tiles`)
        if not _tools.is_tiling_covering_fp(
                computation_tiles, fp,
                allow_outer_pixels=True, allow_overlapping_pixels=True,
        ):
            raise ValueError("`computation_tiles` should be a tiling covering raster's Footprint")
    else:
        # Defer the parameter checking to fp.tile
        computation_tiles = fp.tile(computation_tiles, 0, 0, boundary_effect='shrink')

    # Misc *********************************************
    if max_resampling_size is not None:
        # NOTE(review): the docstring advertises `(int, int)` but `int()` rejects tuples;
        # confirm which of the two is intended.
        max_resampling_size = int(max_resampling_size)
        if max_resampling_size <= 0:
            raise ValueError('`max_resampling_size` should be >0')

    if cache_dir is None:
        raise ValueError('Missing `cache_dir` parameter')
    if not isinstance(cache_dir, (str, pathlib.Path)):
        raise TypeError('cache_dir should be a string')
    cache_dir = str(cache_dir)
    overwrite = bool(ow)
    del ow

    # Construction *********************************************************
    prox = CachedRasterRecipe(
        self,
        fp, dtype, channel_count, channels_schema, wkt,
        compute_array, merge_arrays,
        cache_dir, overwrite,
        primitives_back, primitives_kwargs, convert_footprint_per_primitive,
        computation_pool, merge_pool, io_pool, resample_pool,
        cache_tiles, computation_tiles,
        max_resampling_size,
        debug_observers,
    )

    # Dataset Registering ***********************************************
    if not isinstance(key, _AnonymousSentry):
        self._register([key], prox)
    else:
        # Anonymous source: registered without any key
        self._register([], prox)
    return prox
def acreate_cached_raster_recipe(
        self,

        # raster attributes
        fp, dtype, channel_count, channels_schema=None, sr=None,

        # callbacks running on pool
        compute_array=None, merge_arrays=buzzard.utils.concat_arrays,

        # filesystem
        cache_dir=None, ow=False,

        # primitives
        queue_data_per_primitive=MappingProxyType({}), convert_footprint_per_primitive=None,

        # pools
        computation_pool='cpu', merge_pool='cpu', io_pool='io', resample_pool='cpu',

        # misc
        cache_tiles=(512, 512), computation_tiles=None, max_resampling_size=None,
        debug_observers=()
):
    """Create a cached raster recipe anonymously within this Dataset.

    Every parameter is forwarded unchanged to :py:meth:`Dataset.create_cached_raster_recipe`,
    with an anonymous sentry in place of the `key`.

    See Dataset.create_cached_raster_recipe

    See Also
    --------
    - :py:meth:`Dataset.create_raster_recipe`: To skip the `caching`
    - :py:meth:`Dataset.create_cached_raster_recipe`: To assign a `key` to this source within the `Dataset`

    """
    return self.create_cached_raster_recipe(
        key=_AnonymousSentry(),
        fp=fp,
        dtype=dtype,
        channel_count=channel_count,
        channels_schema=channels_schema,
        sr=sr,
        compute_array=compute_array,
        merge_arrays=merge_arrays,
        cache_dir=cache_dir,
        ow=ow,
        queue_data_per_primitive=queue_data_per_primitive,
        convert_footprint_per_primitive=convert_footprint_per_primitive,
        computation_pool=computation_pool,
        merge_pool=merge_pool,
        io_pool=io_pool,
        resample_pool=resample_pool,
        cache_tiles=cache_tiles,
        computation_tiles=computation_tiles,
        max_resampling_size=max_resampling_size,
        debug_observers=debug_observers,
    )
# Vector entry points *********************************************************************** **
def open_vector(self, key, path, layer=None, driver='ESRI Shapefile', options=(), mode='r'):
    """Open a vector file within this Dataset under `key`. Only metadata are kept in memory.

    >>> help(GDALFileVector)

    Parameters
    ----------
    key: hashable (like a string)
        File identifier within Dataset

        To avoid using a `key`, you may use :py:meth:`aopen_vector`
    path: string
        ..
    layer: None or int or string
        `None` is replaced by `0`, a real number is converted with `int()`, anything else is
        converted with `str()`.
    driver: string
        ogr driver to use when opening the file
    options: sequence of str
        options for ogr
    mode: one of {'r', 'w'}
        ..

    Returns
    -------
    source: GDALFileVector
        ..

    Raises
    ------
    ValueError
        If `driver` is the `Memory` driver (a memory vector can only be created).

    Example
    -------
    >>> ds.open_vector('trees', '/path/to.shp')
    >>> feature_count = len(ds.trees)

    >>> ds.open_vector('roofs', '/path/to.json', driver='GeoJSON', mode='w')
    >>> fields_list = ds.roofs.fields

    See Also
    --------
    - :py:meth:`Dataset.aopen_vector`: To skip the `key` assigment
    - :py:func:`buzzard.open_vector`: To skip the `key` assigment and the explicit `Dataset` instanciation

    """
    # Parameter checking ***************************************************
    path = str(path)
    if layer is None:
        layer = 0
    elif np.all(np.isreal(layer)):
        layer = int(layer)
    else:
        layer = str(layer)
    driver = str(driver)
    options = [str(arg) for arg in options]
    # Result unused: presumably called only so that an invalid `mode` is rejected early
    _ = conv.of_of_mode(mode)

    # Construction dispatch ************************************************
    if driver.lower() == 'memory': # pragma: no cover
        raise ValueError("Can't open a MEMORY vector, use create_vector")

    # The file is not opened here: the source receives a callable to do it
    def allocator():
        return BackGDALFileVector.open_file(path, layer, driver, options, mode)

    prox = GDALFileVector(self, allocator, options, mode)

    # Dataset Registering ***********************************************
    if not isinstance(key, _AnonymousSentry):
        self._register([key], prox)
    else:
        # Anonymous source: registered without any key
        self._register([], prox)
    return prox
def aopen_vector(self, path, layer=None, driver='ESRI Shapefile', options=(), mode='r'):
    """Open a vector file anonymously within this Dataset. Only metadata are kept in memory.

    All parameters are forwarded to :py:meth:`~Dataset.open_vector`, with an anonymous sentry
    in place of the `key`.

    See :py:meth:`~Dataset.open_vector`

    Example
    -------
    >>> trees = ds.aopen_vector('/path/to.shp')
    >>> features_bounds = trees.bounds

    See Also
    --------
    - :py:meth:`Dataset.open_vector`: To assign a `key` to this source within the `Dataset`
    - :py:func:`buzzard.open_vector`: To skip the `key` assigment and the explicit `Dataset` instanciation

    """
    return self.open_vector(
        key=_AnonymousSentry(),
        path=path,
        layer=layer,
        driver=driver,
        options=options,
        mode=mode,
    )
def create_vector(self, key, path, type, fields=(), layer=None,
                  driver='ESRI Shapefile', options=(), sr=None, ow=False):
    r"""Create an empty vector file and register it under `key` within this Dataset. Only metadata
    are kept in memory.

    >>> help(GDALFileVector)
    >>> help(GDALMemoryVector)

    Parameters
    ----------
    key: hashable (like a string)
        File identifier within Dataset

        To avoid using a `key`, you may use :py:meth:`acreate_vector`
    path: string
        Anything that makes sense to GDAL:

        + A path to a file
        + An empty string when using `driver=Memory`
    type: string
        name of a wkb geometry type, without the `wkb` prefix.
        For example: "Point", "Polygon", "LineString".
        (see the `OGRwkbGeometryType` enumeration of GDAL/OGR for the full list)
    fields: sequence of dict
        Attributes of fields, one dict per field. (see :ref:`Field Attributes` below)
    layer: None or string
        If None, the layer is named after the basename of `path`, without its last extension.
    driver: string
        ogr driver to use when opening the file
    options: sequence of str
        options for ogr
    sr: string or None
        Spatial reference of the new file

        In order not to set a spatial reference, use `None`.

        In order to set a spatial reference, use a string that can be converted to WKT by GDAL
        (the conversion is performed with `osr.GetUserInputAsWKT`).
    ow: bool
        Overwrite. Whether or not to erase the existing files.

    Returns
    -------
    source: GDALFileVector or GDALMemoryVector
        The type depends on the `driver` parameter

    Example
    -------
    >>> ds.create_vector('lines', '/path/to.shp', 'linestring')
    >>> geometry_type = ds.lines.type
    >>> ds.lines.insert_data([[0, 0], [1, 1], [1, 2]])

    >>> fields = [
    ...     {'name': 'name', 'type': str},
    ...     {'name': 'count', 'type': 'int32'},
    ...     {'name': 'area', 'type': np.float64, 'width': 5, 'precision': 18},
    ...     {'name': 'when', 'type': np.datetime64},
    ... ]
    >>> ds.create_vector('zones', '/path/to.shp', 'polygon', fields)
    >>> field0_type = ds.zones.fields[0]['type']
    >>> ds.zones.insert_data(shapely.geometry.box(10, 10, 15, 15))

    .. _Field Attributes:

    Field Attributes
    ----------------
    Attributes:

    - "name": string
    - "type": string (see :ref:`Field Types` below)
    - "precision": int
    - "width": int
    - "nullable": bool
    - "default": same as `type`

    An attribute missing or None is kept to default value.

    .. _Field Types:

    Field Types
    -----------
    +---------------+------------------------------------------------------------------------+
    | Type          | Type names                                                             |
    +===============+========================================================================+
    | Binary        | "binary", bytes, np.bytes\_, aliases of np.bytes\_                     |
    +---------------+------------------------------------------------------------------------+
    | Date          | "date"                                                                 |
    +---------------+------------------------------------------------------------------------+
    | DateTime      | "datetime", datetime.datetime, np.datetime64, aliases of np.datetime64 |
    +---------------+------------------------------------------------------------------------+
    | Time          | "time"                                                                 |
    +---------------+------------------------------------------------------------------------+
    | Integer       | "integer", np.int32, aliases of np.int32                               |
    +---------------+------------------------------------------------------------------------+
    | Integer64     | "integer64", int, np.int64, aliases of np.int64                        |
    +---------------+------------------------------------------------------------------------+
    | Real          | "real", float, np.float64, aliases of np.float64                       |
    +---------------+------------------------------------------------------------------------+
    | String        | "string", str, np.str\_, aliases of np.str\_                           |
    +---------------+------------------------------------------------------------------------+
    | Integer64List | "integer64list", "int list"                                            |
    +---------------+------------------------------------------------------------------------+
    | IntegerList   | "integerlist"                                                          |
    +---------------+------------------------------------------------------------------------+
    | RealList      | "reallist", "float list"                                               |
    +---------------+------------------------------------------------------------------------+

    See Also
    --------
    - :py:meth:`Dataset.acreate_vector`: To skip the `key` assigment
    - :py:func:`buzzard.create_vector`: To skip the `key` assigment and the explicit `Dataset` instanciation

    """
    # Parameter checking ***************************************************
    path = str(path)
    # Round trip through the wkb code to normalize the geometry type name
    geom_type = conv.str_of_wkbgeom(conv.wkbgeom_of_str(type))
    fields = _tools.normalize_fields_defn(fields)
    if layer is not None:
        layer = str(layer)
    else:
        # Basename of `path` stripped of its last extension ('' when there is no dot)
        layer = os.path.basename(path).rpartition('.')[0]
    driver = str(driver)
    options = [str(arg) for arg in options]
    ow = bool(ow)

    wkt = None
    if sr is not None:
        success, payload = Catch(osr.GetUserInputAsWKT, nonzero_int_is_error=True)(sr)
        if not success:
            raise ValueError('Could not transform `sr` to `wkt` (gdal error: `{}`)'.format(
                payload[1]
            ))
        wkt = payload

    # Construction dispatch ************************************************
    if driver.lower() == 'memory':
        # Memory vectors ignore `path` and `ow`
        def allocator():
            return BackGDALFileVector.create_file(
                '', geom_type, fields, layer, 'Memory', options, wkt, False,
            )
        prox = GDALMemoryVector(self, allocator, options)
    else:
        def allocator():
            return BackGDALFileVector.create_file(
                path, geom_type, fields, layer, driver, options, wkt, ow
            )
        prox = GDALFileVector(self, allocator, options, 'w')

    # Dataset Registering ***********************************************
    # An anonymous source is registered without any key
    registered_keys = [] if isinstance(key, _AnonymousSentry) else [key]
    self._register(registered_keys, prox)
    return prox
def acreate_vector(self, path, type, fields=(), layer=None,
                   driver='ESRI Shapefile', options=(), sr=None, ow=False):
    """Create a vector file anonymously within this Dataset. Only metadata are kept in memory.

    All parameters are forwarded to :py:meth:`~Dataset.create_vector`, with an anonymous sentry
    in place of the `key`.

    See :py:meth:`~Dataset.create_vector`

    Example
    -------
    >>> lines = ds.acreate_vector('/path/to.shp', 'linestring')
    >>> file_proj4 = lines.proj4_stored

    See Also
    --------
    - :py:meth:`Dataset.create_vector`: To assign a `key` to this source within the `Dataset`
    - :py:func:`buzzard.create_vector`: To skip the `key` assigment and the explicit `Dataset` instanciation

    """
    return self.create_vector(
        key=_AnonymousSentry(),
        path=path,
        type=type,
        fields=fields,
        layer=layer,
        driver=driver,
        options=options,
        sr=sr,
        ow=ow,
    )
# Cleanup *********************************************************************************** **
def __del__(self):
    """Close the Dataset when it is garbage collected, unless it is already closed.

    `__del__` also runs on instances whose construction failed before `_ds_closed` was
    assigned. Such an instance is treated as already closed, so that the finalizer does
    not raise an `AttributeError`.
    """
    if not getattr(self, '_ds_closed', True):
        self.close()
@property
def close(self):
    """Close the Dataset with a call or a context management.
    The `close` attribute returns an object that can be both called and used in a with statement

    The Dataset can be closed manually or automatically when garbage collected, it is safer
    to do it manually.

    The internal steps are:

    - Stopping the scheduler
    - Joining the mp.Pool that have been automatically allocated
    - Closing all sources

    Accessing this attribute on a Dataset that is already closed raises a `RuntimeError`,
    and so does triggering the returned routine a second time.

    Examples
    --------
    >>> ds = buzz.Dataset()
    ... # code...
    ... ds.close()

    >>> with buzz.Dataset().close as ds:
    ...     # code...

    Caveat
    ------
    When using a scheduler, some memory leaks may still occur after closing a Dataset.
    Possible origins:

    - Gdal cache not flushed (not a leak)
    - The gdal version
    - Some unknown leak in the python `threading` or `multiprocessing` standard library
    - Some unknown library leaking memory on the `C` side
    - Some unknown library storing data in global variables

    You can use a `debug_observer` with an `on_object_allocated` method to track large objects
    allocated in the scheduler. It will likely not be the source of the problem. If you even
    find a source of leaks please contact the buzzard team.

    """
    closed_message = "Dataset already closed"
    if self._ds_closed:
        raise RuntimeError(closed_message)

    def _close_dataset():
        # The state is checked again: the routine may run long after the attribute access
        if self._ds_closed:
            raise RuntimeError(closed_message)
        self._ds_closed = True

        # Tell scheduler to stop, wait until it is done
        self._back.stop_scheduler()

        # Safely release all resources
        self._back.pools_container._close()
        # Iterate over a snapshot of the registered sources
        for source in tuple(self._keys_of_source.keys()):
            source.close()

    return _CloseRoutine(self, _close_dataset)

# Source infos ******************************************************************************* **
def __getitem__(self, key):
    """Retrieve a source from its key.

    The lookup is delegated to the `_source_of_key` mapping of the Dataset.
    """
    registry = self._source_of_key
    return registry[key]
def __contains__(self, item):
    """Is key or source registered in Dataset.

    An `ASource` instance is searched among the registered sources, any other object is
    searched among the registered keys.
    """
    if isinstance(item, ASource):
        registry = self._keys_of_source
    else:
        registry = self._source_of_key
    return item in registry
def items(self):
    """Generate the pair of (keys_of_source, source) for all proxies.

    The keys of each source are yielded as a new list.
    """
    yield from (
        (list(source_keys), source)
        for source, source_keys in self._keys_of_source.items()
    )
def keys(self):
    """Generate all source keys.

    Sources registered without a key contribute nothing.
    """
    for source_keys in self._keys_of_source.values():
        yield from source_keys
def values(self):
    """Generate all proxies, i.e. every source registered in this Dataset."""
    yield from self._keys_of_source.keys()
def __len__(self):
    """Retrieve source count registered within this Dataset.

    Sources are counted, not keys: a source counts once whatever its number of keys.
    """
    sources = self._keys_of_source
    return len(sources)
# Spatial reference getters ***************************************************************** **
@property
def proj4(self):
    """Dataset's work spatial reference in proj4 format.

    Returns None when the Dataset has no work spatial reference (`mode 1`).
    """
    wkt_work = self._back.wkt_work
    if wkt_work is not None:
        return osr.SpatialReference(wkt_work).ExportToProj4()
    return None

@property
def wkt(self):
    """Dataset's work spatial reference in WKT format.

    Returns None when the Dataset has no work spatial reference (`mode 1`).
    """
    back = self._back
    return back.wkt_work

# Activation mechanisms ********************************************************************* **
@property
def active_count(self):
    """Count how many driver objects are currently active"""
    back = self._back
    return back.active_count()
def activate_all(self):
    """Activate all deactivable proxies.

    Only the sources that are instances of `APooledEmissary` are considered.

    Raises
    ------
    RuntimeError
        If the number of pooled sources is greater than `max_activated`
    """
    proxs = [
        prox
        for prox in self._keys_of_source.keys()
        if isinstance(prox, APooledEmissary)
    ]
    total = len(proxs)

    if self._back.max_active < total:
        raise RuntimeError("Can't activate all pooled sources at the same time: {} pooled sources and max_activated is {}".format(
            total, self._back.max_active,
        ))

    # Hacky implementation to get the expected behavior
    # TODO: Implement that routine in the back driver pool. Is it possible? We need to call `.activate`
    #
    # `i` counts the consecutive sources seen active. The sources are cycled through until
    # `total` of them in a row were found active. The counter restarts at 1 after each
    # activation, presumably because activating a source may deactivate another one.
    # NOTE(review): relies on the `active` attribute of pooled sources - confirm.
    i = 0
    for prox in itertools.cycle(proxs):
        if i == total:
            break
        if not prox.active:
            prox.activate()
            i = 1
        else:
            i += 1
def deactivate_all(self):
    """Deactivate all deactivable proxies. Useful to flush all files to disk

    Every registered source that reports itself as active is deactivated; the other ones
    are left untouched.
    """
    # NOTE(review): relies on the `active` attribute of sources - confirm.
    for prox in self._keys_of_source.keys():
        if prox.active:
            prox.deactivate()
# Pools infos ******************************************************************************* **
@property
def pools(self):
    """Get the Pool Container.

    >>> help(PoolsContainer)
    """
    back = self._back
    return back.pools_container

# Deprecation ******************************************************************************* **
# Former method names, each one wrapping the method that replaced it
open_araster = deprecation_pool.wrap_method(aopen_raster, '0.4.4')
create_araster = deprecation_pool.wrap_method(acreate_raster, '0.4.4')
open_avector = deprecation_pool.wrap_method(aopen_vector, '0.4.4')
create_avector = deprecation_pool.wrap_method(acreate_vector, '0.4.4')
# The end *********************************************************************************** **
# ******************************************************************************************* **

# The interpreter calls `__set_name__` on class attributes by itself since Python 3.6
# (PEP 487). On older interpreters the hook is triggered by hand for `Dataset`.
if sys.version_info < (3, 6):
    for attr_name, attr in vars(Dataset).items():
        if hasattr(attr, '__set_name__'):
            attr.__set_name__(Dataset, attr_name)
def open_raster(*args, **kwargs):
    """Shortcut for `Dataset().aopen_raster`

    A new `Dataset` is instantiated with default parameters and all arguments are forwarded
    to its `aopen_raster` method.

    >>> help(Dataset.open_raster)

    See Also
    --------
    - :py:func:`Dataset.open_raster`
    - :py:meth:`Dataset.aopen_raster`
    """
    dataset = Dataset()
    return dataset.aopen_raster(*args, **kwargs)
def create_raster(*args, **kwargs):
    """Shortcut for `Dataset().acreate_raster`

    A new `Dataset` is instantiated with default parameters and all arguments are forwarded
    to its `acreate_raster` method.

    >>> help(Dataset.create_raster)

    See Also
    --------
    - :py:func:`Dataset.create_raster`
    - :py:meth:`Dataset.acreate_raster`
    """
    dataset = Dataset()
    return dataset.acreate_raster(*args, **kwargs)
def open_vector(*args, **kwargs):
    """Shortcut for `Dataset().aopen_vector`

    A new `Dataset` is instantiated with default parameters and all arguments are forwarded
    to its `aopen_vector` method.

    >>> help(Dataset.open_vector)

    See Also
    --------
    - :py:func:`Dataset.open_vector`
    - :py:meth:`Dataset.aopen_vector`
    """
    dataset = Dataset()
    return dataset.aopen_vector(*args, **kwargs)
def create_vector(*args, **kwargs):
    """Shortcut for `Dataset().acreate_vector`

    A new `Dataset` is instantiated with default parameters and all arguments are forwarded
    to its `acreate_vector` method.

    >>> help(Dataset.create_vector)

    See Also
    --------
    - :py:func:`Dataset.create_vector`
    - :py:meth:`Dataset.acreate_vector`
    """
    dataset = Dataset()
    return dataset.acreate_vector(*args, **kwargs)
def wrap_numpy_raster(*args, **kwargs):
    """Shortcut for `Dataset().awrap_numpy_raster`

    A new `Dataset` is instantiated with default parameters and all arguments are forwarded
    to its `awrap_numpy_raster` method.

    >>> help(Dataset.wrap_numpy_raster)

    See Also
    --------
    - :py:func:`Dataset.wrap_numpy_raster`
    - :py:meth:`Dataset.awrap_numpy_raster`
    """
    dataset = Dataset()
    return dataset.awrap_numpy_raster(*args, **kwargs)
class _CloseRoutine(_tools.CallOrContext):
    # Object returned by `Dataset.close`, it shares the documentation of that property
    __doc__ = Dataset.close.__doc__


# Former name of the `Dataset` class, kept as a deprecated alias
DataSource = deprecation_pool.wrap_class(Dataset, 'DataSource', '0.6.0')


class _AnonymousSentry:
    """Sentry object used to instantiate anonymous proxies"""