Source code for zarr.core

import binascii
import hashlib
import itertools
import math
import operator
import re
from functools import reduce

import numpy as np
from numcodecs.compat import ensure_bytes, ensure_ndarray

from collections.abc import MutableMapping

from zarr.attrs import Attributes
from zarr.codecs import AsType, get_codec
from zarr.errors import ArrayNotFoundError, ReadOnlyError, ArrayIndexError
from zarr.indexing import (
    BasicIndexer,
    CoordinateIndexer,
    MaskIndexer,
    OIndex,
    OrthogonalIndexer,
    VIndex,
    PartialChunkIterator,
    check_fields,
    check_no_multi_fields,
    ensure_tuple,
    err_too_many_indices,
    is_contiguous_selection,
    is_pure_fancy_indexing,
    is_scalar,
    pop_fields,
)
from zarr.storage import array_meta_key, attrs_key, getsize, listdir, BaseStore
from zarr.util import (
    all_equal,
    InfoReporter,
    check_array_shape,
    human_readable_size,
    is_total_slice,
    nolock,
    normalize_chunks,
    normalize_resize_args,
    normalize_shape,
    normalize_storage_path,
    PartialReadBuffer,
)


# noinspection PyUnresolvedReferences
[docs]class Array: """Instantiate an array from an initialized store. Parameters ---------- store : MutableMapping Array store, already initialized. path : string, optional Storage path. read_only : bool, optional True if array should be protected against modification. chunk_store : MutableMapping, optional Separate storage for chunks. If not provided, `store` will be used for storage of both chunks and metadata. synchronizer : object, optional Array synchronizer. cache_metadata : bool, optional If True (default), array configuration metadata will be cached for the lifetime of the object. If False, array metadata will be reloaded prior to all data access and modification operations (may incur overhead depending on storage and data access pattern). cache_attrs : bool, optional If True (default), user attributes will be cached for attribute read operations. If False, user attributes are reloaded from the store prior to all attribute read operations. partial_decompress : bool, optional If True and while the chunk_store is a FSStore and the compression used is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. .. versionadded:: 2.7 write_empty_chunks : bool, optional If True (default), all chunks will be stored regardless of their contents. If False, each chunk is compared to the array's fill value prior to storing. If a chunk is uniformly equal to the fill value, then that chunk is not be stored, and the store entry for that chunk's key is deleted. This setting enables sparser storage, as only chunks with non-fill-value data are stored, at the expense of overhead associated with checking the data of each chunk. .. versionadded:: 2.11 Attributes ---------- store path name read_only chunk_store shape chunks dtype compression compression_opts dimension_separator fill_value order synchronizer filters attrs size itemsize nbytes nbytes_stored cdata_shape nchunks nchunks_initialized is_view info vindex oindex write_empty_chunks Methods ------- __getitem__ __setitem__ get_basic_selection set_basic_selection get_orthogonal_selection set_orthogonal_selection get_mask_selection set_mask_selection get_coordinate_selection set_coordinate_selection digest hexdigest resize append view astype """ def __init__( self, store: BaseStore, path=None, read_only=False, chunk_store=None, synchronizer=None, cache_metadata=True, cache_attrs=True, partial_decompress=False, write_empty_chunks=True, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized store = BaseStore._ensure_store(store) chunk_store = BaseStore._ensure_store(chunk_store) self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) if self._path: self._key_prefix = self._path + '/' else: self._key_prefix = '' self._read_only = bool(read_only) self._synchronizer = synchronizer self._cache_metadata = cache_metadata self._is_view = False self._partial_decompress = partial_decompress self._write_empty_chunks = write_empty_chunks # initialize metadata self._load_metadata() # initialize attributes akey = self._key_prefix + attrs_key self._attrs = Attributes(store, key=akey, read_only=read_only, synchronizer=synchronizer, cache=cache_attrs) # initialize info reporter self._info_reporter = InfoReporter(self) # initialize indexing helpers self._oindex = OIndex(self) self._vindex = VIndex(self) def _load_metadata(self): """(Re)load metadata from store.""" if self._synchronizer is None: self._load_metadata_nosync() else: mkey = self._key_prefix + array_meta_key with self._synchronizer[mkey]: self._load_metadata_nosync() def _load_metadata_nosync(self): try: mkey = self._key_prefix + array_meta_key meta_bytes = self._store[mkey] except KeyError: raise ArrayNotFoundError(self._path) else: # decode and store metadata as instance members meta = self._store._metadata_class.decode_array_metadata(meta_bytes) self._meta = meta self._shape = meta['shape'] self._chunks = meta['chunks'] self._dtype = meta['dtype'] self._fill_value = meta['fill_value'] self._order = meta['order'] dimension_separator = meta.get('dimension_separator', None) if dimension_separator is None: try: dimension_separator = self._store._dimension_separator except (AttributeError, KeyError): pass # Fallback for any stores which do not choose a default if dimension_separator is None: dimension_separator = "." self._dimension_separator = dimension_separator # setup compressor config = meta['compressor'] if config is None: self._compressor = None else: self._compressor = get_codec(config) # setup filters filters = meta['filters'] if filters: filters = [get_codec(config) for config in filters] self._filters = filters def _refresh_metadata(self): if not self._cache_metadata: self._load_metadata() def _refresh_metadata_nosync(self): if not self._cache_metadata and not self._is_view: self._load_metadata_nosync() def _flush_metadata_nosync(self): if self._is_view: raise PermissionError('operation not permitted for views') if self._compressor: compressor_config = self._compressor.get_config() else: compressor_config = None if self._filters: filters_config = [f.get_config() for f in self._filters] else: filters_config = None meta = dict(shape=self._shape, chunks=self._chunks, dtype=self._dtype, compressor=compressor_config, fill_value=self._fill_value, order=self._order, filters=filters_config) mkey = self._key_prefix + array_meta_key self._store[mkey] = self._store._metadata_class.encode_array_metadata(meta) @property def store(self): """A MutableMapping providing the underlying storage for the array.""" return self._store @property def path(self): """Storage path.""" return self._path @property def name(self): """Array name following h5py convention.""" if self.path: # follow h5py convention: add leading slash name = self.path if name[0] != '/': name = '/' + name return name return None @property def basename(self): """Final component of name.""" if self.name is not None: return self.name.split('/')[-1] return None @property def read_only(self): """A boolean, True if modification operations are not permitted.""" return self._read_only @read_only.setter def read_only(self, value): self._read_only = bool(value) @property def chunk_store(self): """A MutableMapping providing the underlying storage for array chunks.""" if self._chunk_store is None: return self._store else: return self._chunk_store @property def shape(self): """A tuple of integers describing the length of each dimension of the array.""" # N.B., shape may change if array is resized, hence need to refresh # metadata self._refresh_metadata() return self._shape @shape.setter def shape(self, value): self.resize(value) @property def chunks(self): """A tuple of integers describing the length of each dimension of a chunk of the array.""" return self._chunks @property def dtype(self): """The NumPy data type.""" return self._dtype @property def compressor(self): """Primary compression codec.""" return self._compressor @property def fill_value(self): """A value used for uninitialized portions of the array.""" return self._fill_value @fill_value.setter def fill_value(self, new): self._fill_value = new self._flush_metadata_nosync() @property def order(self): """A string indicating the order in which bytes are arranged within chunks of the array.""" return self._order @property def filters(self): """One or more codecs used to transform data prior to compression.""" return self._filters @property def synchronizer(self): """Object used to synchronize write access to the array.""" return self._synchronizer @property def attrs(self): """A MutableMapping containing user-defined attributes. Note that attribute values must be JSON serializable.""" return self._attrs @property def ndim(self): """Number of dimensions.""" return len(self._shape) @property def _size(self): return reduce(operator.mul, self._shape, 1) @property def size(self): """The total number of elements in the array.""" # N.B., this property depends on shape, and shape may change if array # is resized, hence need to refresh metadata self._refresh_metadata() return self._size @property def itemsize(self): """The size in bytes of each item in the array.""" return self.dtype.itemsize @property def _nbytes(self): return self._size * self.itemsize @property def nbytes(self): """The total number of bytes that would be required to store the array without compression.""" # N.B., this property depends on shape, and shape may change if array # is resized, hence need to refresh metadata self._refresh_metadata() return self._nbytes @property def nbytes_stored(self): """The total number of stored bytes of data for the array. This includes storage required for configuration metadata and user attributes.""" m = getsize(self._store, self._path) if self._chunk_store is None: return m else: n = getsize(self._chunk_store, self._path) if m < 0 or n < 0: return -1 else: return m + n @property def _cdata_shape(self): if self._shape == (): return 1, else: return tuple(math.ceil(s / c) for s, c in zip(self._shape, self._chunks)) @property def cdata_shape(self): """A tuple of integers describing the number of chunks along each dimension of the array.""" self._refresh_metadata() return self._cdata_shape @property def _nchunks(self): return reduce(operator.mul, self._cdata_shape, 1) @property def nchunks(self): """Total number of chunks.""" self._refresh_metadata() return self._nchunks @property def nchunks_initialized(self): """The number of chunks that have been initialized with some data.""" # key pattern for chunk keys prog = re.compile(r'\.'.join([r'\d+'] * min(1, self.ndim))) # count chunk keys return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) # backwards compatibility initialized = nchunks_initialized @property def is_view(self): """A boolean, True if this array is a view on another array.""" return self._is_view @property def oindex(self): """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and :func:`set_orthogonal_selection` for documentation and examples.""" return self._oindex @property def vindex(self): """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, :func:`set_coordinate_selection`, :func:`get_mask_selection` and :func:`set_mask_selection` for documentation and examples.""" return self._vindex @property def write_empty_chunks(self) -> bool: """A Boolean, True if chunks composed of the array's fill value will be stored. If False, such chunks will not be stored. """ return self._write_empty_chunks def __eq__(self, other): return ( isinstance(other, Array) and self.store == other.store and self.read_only == other.read_only and self.path == other.path and not self._is_view # N.B., no need to compare other properties, should be covered by # store comparison ) def __array__(self, *args): a = self[...] if args: a = a.astype(args[0]) return a def islice(self, start=None, end=None): """ Yield a generator for iterating over the entire or parts of the array. Uses a cache so chunks only have to be decompressed once. Parameters ---------- start : int, optional Start index for the generator to start at. Defaults to 0. end : int, optional End index for the generator to stop at. Defaults to self.shape[0]. Yields ------ out : generator A generator that can be used to iterate over the requested region the array. Examples -------- Setup a 1-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100)) Iterate over part of the array: >>> for value in z.islice(25, 30): value; 25 26 27 28 29 """ if len(self.shape) == 0: # Same error as numpy raise TypeError("iteration over a 0-d array") if start is None: start = 0 if end is None or end > self.shape[0]: end = self.shape[0] if not isinstance(start, int) or start < 0: raise ValueError('start must be a nonnegative integer') if not isinstance(end, int) or end < 0: raise ValueError('end must be a nonnegative integer') # Avoid repeatedly decompressing chunks by iterating over the chunks # in the first dimension. chunk_size = self.chunks[0] chunk = None for j in range(start, end): if j % chunk_size == 0: chunk = self[j: j + chunk_size] # init chunk if we start offset of chunk borders elif chunk is None: chunk_start = j - j % chunk_size chunk_end = chunk_start + chunk_size chunk = self[chunk_start:chunk_end] yield chunk[j % chunk_size] def __iter__(self): return self.islice() def __len__(self): if self.shape: return self.shape[0] else: # 0-dimensional array, same error message as numpy raise TypeError('len() of unsized object')
[docs] def __getitem__(self, selection): """Retrieve data for an item or region of the array. Parameters ---------- selection : tuple An integer index or slice or tuple of int/slice objects specifying the requested item or region for each dimension of the array. Returns ------- out : ndarray A NumPy array containing the data for the requested region. Examples -------- Setup a 1-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100)) Retrieve a single item:: >>> z[5] 5 Retrieve a region via slicing:: >>> z[:5] array([0, 1, 2, 3, 4]) >>> z[-5:] array([95, 96, 97, 98, 99]) >>> z[5:10] array([5, 6, 7, 8, 9]) >>> z[5:10:2] array([5, 7, 9]) >>> z[::2] array([ 0, 2, 4, ..., 94, 96, 98]) Load the entire array into memory:: >>> z[...] array([ 0, 1, 2, ..., 97, 98, 99]) Setup a 2-dimensional array:: >>> z = zarr.array(np.arange(100).reshape(10, 10)) Retrieve an item:: >>> z[2, 2] 22 Retrieve a region via slicing:: >>> z[1:3, 1:3] array([[11, 12], [21, 22]]) >>> z[1:3, :] array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) >>> z[:, 1:3] array([[ 1, 2], [11, 12], [21, 22], [31, 32], [41, 42], [51, 52], [61, 62], [71, 72], [81, 82], [91, 92]]) >>> z[0:5:2, 0:5:2] array([[ 0, 2, 4], [20, 22, 24], [40, 42, 44]]) >>> z[::2, ::2] array([[ 0, 2, 4, 6, 8], [20, 22, 24, 26, 28], [40, 42, 44, 46, 48], [60, 62, 64, 66, 68], [80, 82, 84, 86, 88]]) Load the entire array into memory:: >>> z[...] array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) For arrays with a structured dtype, specific fields can be retrieved, e.g.:: >>> a = np.array([(b'aaa', 1, 4.2), ... (b'bbb', 2, 8.4), ... (b'ccc', 3, 12.6)], ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) >>> z = zarr.array(a) >>> z['foo'] array([b'aaa', b'bbb', b'ccc'], dtype='|S3') Notes ----- Slices with step > 1 are supported, but slices with negative step are not. Currently the implementation for __getitem__ is provided by :func:`vindex` if the indexing is pure fancy indexing (ie a broadcast-compatible tuple of integer array indices), or by :func:`set_basic_selection` otherwise. Effectively, this means that the following indexing modes are supported: - integer indexing - slice indexing - mixed slice and integer indexing - boolean indexing - fancy indexing (vectorized list of integers) For specific indexing options including outer indexing, see the methods listed under See Also. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, set_orthogonal_selection, vindex, oindex, __setitem__ """ fields, pure_selection = pop_fields(selection) if is_pure_fancy_indexing(pure_selection, self.ndim): result = self.vindex[selection] else: result = self.get_basic_selection(pure_selection, fields=fields) return result
[docs] def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): """Retrieve data for an item or region of the array. Parameters ---------- selection : tuple A tuple specifying the requested item or region for each dimension of the array. May be any combination of int and/or slice for multidimensional arrays. out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to extract data for. Returns ------- out : ndarray A NumPy array containing the data for the requested region. Examples -------- Setup a 1-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100)) Retrieve a single item:: >>> z.get_basic_selection(5) 5 Retrieve a region via slicing:: >>> z.get_basic_selection(slice(5)) array([0, 1, 2, 3, 4]) >>> z.get_basic_selection(slice(-5, None)) array([95, 96, 97, 98, 99]) >>> z.get_basic_selection(slice(5, 10)) array([5, 6, 7, 8, 9]) >>> z.get_basic_selection(slice(5, 10, 2)) array([5, 7, 9]) >>> z.get_basic_selection(slice(None, None, 2)) array([ 0, 2, 4, ..., 94, 96, 98]) Setup a 2-dimensional array:: >>> z = zarr.array(np.arange(100).reshape(10, 10)) Retrieve an item:: >>> z.get_basic_selection((2, 2)) 22 Retrieve a region via slicing:: >>> z.get_basic_selection((slice(1, 3), slice(1, 3))) array([[11, 12], [21, 22]]) >>> z.get_basic_selection((slice(1, 3), slice(None))) array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) >>> z.get_basic_selection((slice(None), slice(1, 3))) array([[ 1, 2], [11, 12], [21, 22], [31, 32], [41, 42], [51, 52], [61, 62], [71, 72], [81, 82], [91, 92]]) >>> z.get_basic_selection((slice(0, 5, 2), slice(0, 5, 2))) array([[ 0, 2, 4], [20, 22, 24], [40, 42, 44]]) >>> z.get_basic_selection((slice(None, None, 2), slice(None, None, 2))) array([[ 0, 2, 4, 6, 8], [20, 22, 24, 26, 28], [40, 42, 44, 46, 48], [60, 62, 64, 66, 68], [80, 82, 84, 86, 88]]) For arrays with a structured dtype, specific fields can be retrieved, e.g.:: >>> a = np.array([(b'aaa', 1, 4.2), ... (b'bbb', 2, 8.4), ... (b'ccc', 3, 12.6)], ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) >>> z = zarr.array(a) >>> z.get_basic_selection(slice(2), fields='foo') array([b'aaa', b'bbb'], dtype='|S3') Notes ----- Slices with step > 1 are supported, but slices with negative step are not. Currently this method provides the implementation for accessing data via the square bracket notation (__getitem__). See :func:`__getitem__` for examples using the alternative notation. See Also -------- set_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, set_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ """ # refresh metadata if not self._cache_metadata: self._load_metadata() # check args check_fields(fields, self._dtype) # handle zero-dimensional arrays if self._shape == (): return self._get_basic_selection_zd(selection=selection, out=out, fields=fields) else: return self._get_basic_selection_nd(selection=selection, out=out, fields=fields)
def _get_basic_selection_zd(self, selection, out=None, fields=None): # special case basic selection for zero-dimensional array # check selection is valid selection = ensure_tuple(selection) if selection not in ((), (Ellipsis,)): err_too_many_indices(selection, ()) try: # obtain encoded data for chunk ckey = self._chunk_key((0,)) cdata = self.chunk_store[ckey] except KeyError: # chunk not initialized chunk = np.zeros((), dtype=self._dtype) if self._fill_value is not None: chunk.fill(self._fill_value) else: chunk = self._decode_chunk(cdata) # handle fields if fields: chunk = chunk[fields] # handle selection of the scalar value via empty tuple if out is None: out = chunk[selection] else: out[selection] = chunk[selection] return out def _get_basic_selection_nd(self, selection, out=None, fields=None): # implementation of basic selection for array with at least one dimension # setup indexer indexer = BasicIndexer(selection, self) return self._get_selection(indexer=indexer, out=out, fields=fields)
[docs] def get_orthogonal_selection(self, selection, out=None, fields=None): """Retrieve data by making a selection for each dimension of the array. For example, if an array has 2 dimensions, allows selecting specific rows and/or columns. The selection for each dimension can be either an integer (indexing a single item), a slice, an array of integers, or a Boolean array where True values indicate a selection. Parameters ---------- selection : tuple A selection for each dimension of the array. May be any combination of int, slice, integer array or Boolean array. out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to extract data for. Returns ------- out : ndarray A NumPy array containing the data for the requested selection. Examples -------- Setup a 2-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100).reshape(10, 10)) Retrieve rows and columns via any combination of int, slice, integer array and/or Boolean array:: >>> z.get_orthogonal_selection(([1, 4], slice(None))) array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) >>> z.get_orthogonal_selection((slice(None), [1, 4])) array([[ 1, 4], [11, 14], [21, 24], [31, 34], [41, 44], [51, 54], [61, 64], [71, 74], [81, 84], [91, 94]]) >>> z.get_orthogonal_selection(([1, 4], [1, 4])) array([[11, 14], [41, 44]]) >>> sel = np.zeros(z.shape[0], dtype=bool) >>> sel[1] = True >>> sel[4] = True >>> z.get_orthogonal_selection((sel, sel)) array([[11, 14], [41, 44]]) For convenience, the orthogonal selection functionality is also available via the `oindex` property, e.g.:: >>> z.oindex[[1, 4], :] array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) >>> z.oindex[:, [1, 4]] array([[ 1, 4], [11, 14], [21, 24], [31, 34], [41, 44], [51, 54], [61, 64], [71, 74], [81, 84], [91, 94]]) >>> z.oindex[[1, 4], [1, 4]] array([[11, 14], [41, 44]]) >>> sel = np.zeros(z.shape[0], dtype=bool) >>> sel[1] = True >>> sel[4] = True >>> z.oindex[sel, sel] array([[11, 14], [41, 44]]) Notes ----- Orthogonal indexing is also known as outer indexing. Slices with step > 1 are supported, but slices with negative step are not. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ """ # refresh metadata if not self._cache_metadata: self._load_metadata() # check args check_fields(fields, self._dtype) # setup indexer indexer = OrthogonalIndexer(selection, self) return self._get_selection(indexer=indexer, out=out, fields=fields)
[docs] def get_coordinate_selection(self, selection, out=None, fields=None): """Retrieve a selection of individual items, by providing the indices (coordinates) for each selected item. Parameters ---------- selection : tuple An integer (coordinate) array for each dimension of the array. out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to extract data for. Returns ------- out : ndarray A NumPy array containing the data for the requested selection. Examples -------- Setup a 2-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100).reshape(10, 10)) Retrieve items by specifying their coordinates:: >>> z.get_coordinate_selection(([1, 4], [1, 4])) array([11, 44]) For convenience, the coordinate selection functionality is also available via the `vindex` property, e.g.:: >>> z.vindex[[1, 4], [1, 4]] array([11, 44]) Notes ----- Coordinate indexing is also known as point selection, and is a form of vectorized or inner indexing. Slices are not supported. Coordinate arrays must be provided for all dimensions of the array. Coordinate arrays may be multidimensional, in which case the output array will also be multidimensional. Coordinate arrays are broadcast against each other before being applied. The shape of the output will be the same as the shape of each coordinate array after broadcasting. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, get_orthogonal_selection, set_orthogonal_selection, set_coordinate_selection, vindex, oindex, __getitem__, __setitem__ """ # refresh metadata if not self._cache_metadata: self._load_metadata() # check args check_fields(fields, self._dtype) # setup indexer indexer = CoordinateIndexer(selection, self) # handle output - need to flatten if out is not None: out = out.reshape(-1) out = self._get_selection(indexer=indexer, out=out, fields=fields) # restore shape out = out.reshape(indexer.sel_shape) return out
[docs] def get_mask_selection(self, selection, out=None, fields=None): """Retrieve a selection of individual items, by providing a Boolean array of the same shape as the array against which the selection is being made, where True values indicate a selected item. Parameters ---------- selection : ndarray, bool A Boolean array of the same shape as the array against which the selection is being made. out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to extract data for. Returns ------- out : ndarray A NumPy array containing the data for the requested selection. Examples -------- Setup a 2-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100).reshape(10, 10)) Retrieve items by specifying a mask:: >>> sel = np.zeros_like(z, dtype=bool) >>> sel[1, 1] = True >>> sel[4, 4] = True >>> z.get_mask_selection(sel) array([11, 44]) For convenience, the mask selection functionality is also available via the `vindex` property, e.g.:: >>> z.vindex[sel] array([11, 44]) Notes ----- Mask indexing is a form of vectorized or inner indexing, and is equivalent to coordinate indexing. Internally the mask array is converted to coordinate arrays by calling `np.nonzero`. See Also -------- get_basic_selection, set_basic_selection, set_mask_selection, get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, set_coordinate_selection, vindex, oindex, __getitem__, __setitem__ """ # refresh metadata if not self._cache_metadata: self._load_metadata() # check args check_fields(fields, self._dtype) # setup indexer indexer = MaskIndexer(selection, self) return self._get_selection(indexer=indexer, out=out, fields=fields)
def _get_selection(self, indexer, out=None, fields=None): # We iterate over all chunks which overlap the selection and thus contain data # that needs to be extracted. Each chunk is processed in turn, extracting the # necessary data and storing into the correct location in the output array. # N.B., it is an important optimisation that we only visit chunks which overlap # the selection. This minimises the number of iterations in the main for loop. # check fields are sensible out_dtype = check_fields(fields, self._dtype) # determine output shape out_shape = indexer.shape # setup output array if out is None: out = np.empty(out_shape, dtype=out_dtype, order=self._order) else: check_array_shape('out', out, out_shape) # iterate over chunks if not hasattr(self.chunk_store, "getitems") or \ any(map(lambda x: x == 0, self.shape)): # sequentially get one key at a time from storage for chunk_coords, chunk_selection, out_selection in indexer: # load chunk selection into output array self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection, drop_axes=indexer.drop_axes, fields=fields) else: # allow storage to get multiple items at once lchunk_coords, lchunk_selection, lout_selection = zip(*indexer) self._chunk_getitems(lchunk_coords, lchunk_selection, out, lout_selection, drop_axes=indexer.drop_axes, fields=fields) if out.shape: return out else: return out[()]
[docs] def __setitem__(self, selection, value): """Modify data for an item or region of the array. Parameters ---------- selection : tuple An integer index or slice or tuple of int/slice specifying the requested region for each dimension of the array. value : scalar or array-like Value to be stored into the array. Examples -------- Setup a 1-dimensional array:: >>> import zarr >>> z = zarr.zeros(100, dtype=int) Set all array elements to the same scalar value:: >>> z[...] = 42 >>> z[...] array([42, 42, 42, ..., 42, 42, 42]) Set a portion of the array:: >>> z[:10] = np.arange(10) >>> z[-10:] = np.arange(10)[::-1] >>> z[...] array([ 0, 1, 2, ..., 2, 1, 0]) Setup a 2-dimensional array:: >>> z = zarr.zeros((5, 5), dtype=int) Set all array elements to the same scalar value:: >>> z[...] = 42 Set a portion of the array:: >>> z[0, :] = np.arange(z.shape[1]) >>> z[:, 0] = np.arange(z.shape[0]) >>> z[...] array([[ 0, 1, 2, 3, 4], [ 1, 42, 42, 42, 42], [ 2, 42, 42, 42, 42], [ 3, 42, 42, 42, 42], [ 4, 42, 42, 42, 42]]) For arrays with a structured dtype, specific fields can be modified, e.g.:: >>> a = np.array([(b'aaa', 1, 4.2), ... (b'bbb', 2, 8.4), ... (b'ccc', 3, 12.6)], ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) >>> z = zarr.array(a) >>> z['foo'] = b'zzz' >>> z[...] array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'zzz', 3, 12.6)], dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')]) Notes ----- Slices with step > 1 are supported, but slices with negative step are not. Currently the implementation for __setitem__ is provided by :func:`vindex` if the indexing is pure fancy indexing (ie a broadcast-compatible tuple of integer array indices), or by :func:`set_basic_selection` otherwise. Effectively, this means that the following indexing modes are supported: - integer indexing - slice indexing - mixed slice and integer indexing - boolean indexing - fancy indexing (vectorized list of integers) For specific indexing options including outer indexing, see the methods listed under See Also. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, set_orthogonal_selection, vindex, oindex, __getitem__ """ fields, pure_selection = pop_fields(selection) if is_pure_fancy_indexing(pure_selection, self.ndim): self.vindex[selection] = value else: self.set_basic_selection(pure_selection, value, fields=fields)
[docs] def set_basic_selection(self, selection, value, fields=None): """Modify data for an item or region of the array. Parameters ---------- selection : tuple An integer index or slice or tuple of int/slice specifying the requested region for each dimension of the array. value : scalar or array-like Value to be stored into the array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to set data for. Examples -------- Setup a 1-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.zeros(100, dtype=int) Set all array elements to the same scalar value:: >>> z.set_basic_selection(..., 42) >>> z[...] array([42, 42, 42, ..., 42, 42, 42]) Set a portion of the array:: >>> z.set_basic_selection(slice(10), np.arange(10)) >>> z.set_basic_selection(slice(-10, None), np.arange(10)[::-1]) >>> z[...] array([ 0, 1, 2, ..., 2, 1, 0]) Setup a 2-dimensional array:: >>> z = zarr.zeros((5, 5), dtype=int) Set all array elements to the same scalar value:: >>> z.set_basic_selection(..., 42) Set a portion of the array:: >>> z.set_basic_selection((0, slice(None)), np.arange(z.shape[1])) >>> z.set_basic_selection((slice(None), 0), np.arange(z.shape[0])) >>> z[...] array([[ 0, 1, 2, 3, 4], [ 1, 42, 42, 42, 42], [ 2, 42, 42, 42, 42], [ 3, 42, 42, 42, 42], [ 4, 42, 42, 42, 42]]) For arrays with a structured dtype, the `fields` parameter can be used to set data for a specific field, e.g.:: >>> a = np.array([(b'aaa', 1, 4.2), ... (b'bbb', 2, 8.4), ... (b'ccc', 3, 12.6)], ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) >>> z = zarr.array(a) >>> z.set_basic_selection(slice(0, 2), b'zzz', fields='foo') >>> z[:] array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'ccc', 3, 12.6)], dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')]) Notes ----- This method provides the underlying implementation for modifying data via square bracket notation, see :func:`__setitem__` for equivalent examples using the alternative notation. See Also -------- get_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, set_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ """ # guard conditions if self._read_only: raise ReadOnlyError() # refresh metadata if not self._cache_metadata: self._load_metadata_nosync() # handle zero-dimensional arrays if self._shape == (): return self._set_basic_selection_zd(selection, value, fields=fields) else: return self._set_basic_selection_nd(selection, value, fields=fields)
[docs] def set_orthogonal_selection(self, selection, value, fields=None): """Modify data via a selection for each dimension of the array. Parameters ---------- selection : tuple A selection for each dimension of the array. May be any combination of int, slice, integer array or Boolean array. value : scalar or array-like Value to be stored into the array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to set data for. Examples -------- Setup a 2-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.zeros((5, 5), dtype=int) Set data for a selection of rows:: >>> z.set_orthogonal_selection(([1, 4], slice(None)), 1) >>> z[...] array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [1, 1, 1, 1, 1]]) Set data for a selection of columns:: >>> z.set_orthogonal_selection((slice(None), [1, 4]), 2) >>> z[...] array([[0, 2, 0, 0, 2], [1, 2, 1, 1, 2], [0, 2, 0, 0, 2], [0, 2, 0, 0, 2], [1, 2, 1, 1, 2]]) Set data for a selection of rows and columns:: >>> z.set_orthogonal_selection(([1, 4], [1, 4]), 3) >>> z[...] array([[0, 2, 0, 0, 2], [1, 3, 1, 1, 3], [0, 2, 0, 0, 2], [0, 2, 0, 0, 2], [1, 3, 1, 1, 3]]) For convenience, this functionality is also available via the `oindex` property. E.g.:: >>> z.oindex[[1, 4], [1, 4]] = 4 >>> z[...] array([[0, 2, 0, 0, 2], [1, 4, 1, 1, 4], [0, 2, 0, 0, 2], [0, 2, 0, 0, 2], [1, 4, 1, 1, 4]]) Notes ----- Orthogonal indexing is also known as outer indexing. Slices with step > 1 are supported, but slices with negative step are not. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ """ # guard conditions if self._read_only: raise ReadOnlyError() # refresh metadata if not self._cache_metadata: self._load_metadata_nosync() # setup indexer indexer = OrthogonalIndexer(selection, self) self._set_selection(indexer, value, fields=fields)
[docs] def set_coordinate_selection(self, selection, value, fields=None): """Modify a selection of individual items, by providing the indices (coordinates) for each item to be modified. Parameters ---------- selection : tuple An integer (coordinate) array for each dimension of the array. value : scalar or array-like Value to be stored into the array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to set data for. Examples -------- Setup a 2-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.zeros((5, 5), dtype=int) Set data for a selection of items:: >>> z.set_coordinate_selection(([1, 4], [1, 4]), 1) >>> z[...] array([[0, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 1]]) For convenience, this functionality is also available via the `vindex` property. E.g.:: >>> z.vindex[[1, 4], [1, 4]] = 2 >>> z[...] array([[0, 0, 0, 0, 0], [0, 2, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 2]]) Notes ----- Coordinate indexing is also known as point selection, and is a form of vectorized or inner indexing. Slices are not supported. Coordinate arrays must be provided for all dimensions of the array. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, vindex, oindex, __getitem__, __setitem__ """ # guard conditions if self._read_only: raise ReadOnlyError() # refresh metadata if not self._cache_metadata: self._load_metadata_nosync() # setup indexer indexer = CoordinateIndexer(selection, self) # handle value - need to flatten if not is_scalar(value, self._dtype): value = np.asanyarray(value) if hasattr(value, 'shape') and len(value.shape) > 1: value = value.reshape(-1) self._set_selection(indexer, value, fields=fields)
[docs] def set_mask_selection(self, selection, value, fields=None): """Modify a selection of individual items, by providing a Boolean array of the same shape as the array against which the selection is being made, where True values indicate a selected item. Parameters ---------- selection : ndarray, bool A Boolean array of the same shape as the array against which the selection is being made. value : scalar or array-like Value to be stored into the array. fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to set data for. Examples -------- Setup a 2-dimensional array:: >>> import zarr >>> import numpy as np >>> z = zarr.zeros((5, 5), dtype=int) Set data for a selection of items:: >>> sel = np.zeros_like(z, dtype=bool) >>> sel[1, 1] = True >>> sel[4, 4] = True >>> z.set_mask_selection(sel, 1) >>> z[...] array([[0, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 1]]) For convenience, this functionality is also available via the `vindex` property. E.g.:: >>> z.vindex[sel] = 2 >>> z[...] array([[0, 0, 0, 0, 0], [0, 2, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 2]]) Notes ----- Mask indexing is a form of vectorized or inner indexing, and is equivalent to coordinate indexing. Internally the mask array is converted to coordinate arrays by calling `np.nonzero`. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, set_coordinate_selection, vindex, oindex, __getitem__, __setitem__ """ # guard conditions if self._read_only: raise ReadOnlyError() # refresh metadata if not self._cache_metadata: self._load_metadata_nosync() # setup indexer indexer = MaskIndexer(selection, self) self._set_selection(indexer, value, fields=fields)
def _set_basic_selection_zd(self, selection, value, fields=None): # special case __setitem__ for zero-dimensional array # check selection is valid selection = ensure_tuple(selection) if selection not in ((), (Ellipsis,)): err_too_many_indices(selection, self._shape) # check fields check_fields(fields, self._dtype) fields = check_no_multi_fields(fields) # obtain key for chunk ckey = self._chunk_key((0,)) # setup chunk try: # obtain compressed data for chunk cdata = self.chunk_store[ckey] except KeyError: # chunk not initialized chunk = np.zeros((), dtype=self._dtype) if self._fill_value is not None: chunk.fill(self._fill_value) else: # decode chunk chunk = self._decode_chunk(cdata).copy() # set value if fields: chunk[fields][selection] = value else: chunk[selection] = value # remove chunk if write_empty_chunks is false and it only contains the fill value if (not self.write_empty_chunks) and all_equal(self.fill_value, chunk): try: del self.chunk_store[ckey] return except Exception: # pragma: no cover # deleting failed, fallback to overwriting pass else: # encode and store cdata = self._encode_chunk(chunk) self.chunk_store[ckey] = cdata def _set_basic_selection_nd(self, selection, value, fields=None): # implementation of __setitem__ for array with at least one dimension # setup indexer indexer = BasicIndexer(selection, self) self._set_selection(indexer, value, fields=fields) def _set_selection(self, indexer, value, fields=None): # We iterate over all chunks which overlap the selection and thus contain data # that needs to be replaced. Each chunk is processed in turn, extracting the # necessary data from the value array and storing into the chunk array. # N.B., it is an important optimisation that we only visit chunks which overlap # the selection. This minimises the number of iterations in the main for loop. # check fields are sensible check_fields(fields, self._dtype) fields = check_no_multi_fields(fields) # determine indices of chunks overlapping the selection sel_shape = indexer.shape # check value shape if sel_shape == (): # setting a single item pass elif is_scalar(value, self._dtype): # setting a scalar value pass else: if not hasattr(value, 'shape'): value = np.asanyarray(value) check_array_shape('value', value, sel_shape) # iterate over chunks in range if not hasattr(self.store, "setitems") or self._synchronizer is not None \ or any(map(lambda x: x == 0, self.shape)): # iterative approach for chunk_coords, chunk_selection, out_selection in indexer: # extract data to store if sel_shape == (): chunk_value = value elif is_scalar(value, self._dtype): chunk_value = value else: chunk_value = value[out_selection] # handle missing singleton dimensions if indexer.drop_axes: item = [slice(None)] * self.ndim for a in indexer.drop_axes: item[a] = np.newaxis item = tuple(item) chunk_value = chunk_value[item] # put data self._chunk_setitem(chunk_coords, chunk_selection, chunk_value, fields=fields) else: lchunk_coords, lchunk_selection, lout_selection = zip(*indexer) chunk_values = [] for out_selection in lout_selection: if sel_shape == (): chunk_values.append(value) elif is_scalar(value, self._dtype): chunk_values.append(value) else: cv = value[out_selection] # handle missing singleton dimensions if indexer.drop_axes: # pragma: no cover item = [slice(None)] * self.ndim for a in indexer.drop_axes: item[a] = np.newaxis item = tuple(item) cv = chunk_value[item] chunk_values.append(cv) self._chunk_setitems(lchunk_coords, lchunk_selection, chunk_values, fields=fields) def _process_chunk( self, out, cdata, chunk_selection, drop_axes, out_is_ndarray, fields, out_selection, partial_read_decode=False, ): """Take binary data from storage and fill output array""" if (out_is_ndarray and not fields and is_contiguous_selection(out_selection) and is_total_slice(chunk_selection, self._chunks) and not self._filters and self._dtype != object): dest = out[out_selection] write_direct = ( dest.flags.writeable and ( (self._order == 'C' and dest.flags.c_contiguous) or (self._order == 'F' and dest.flags.f_contiguous) ) ) if write_direct: # optimization: we want the whole chunk, and the destination is # contiguous, so we can decompress directly from the chunk # into the destination array if self._compressor: if isinstance(cdata, PartialReadBuffer): cdata = cdata.read_full() self._compressor.decode(cdata, dest) else: chunk = ensure_ndarray(cdata).view(self._dtype) chunk = chunk.reshape(self._chunks, order=self._order) np.copyto(dest, chunk) return # decode chunk try: if partial_read_decode: cdata.prepare_chunk() # size of chunk tmp = np.empty(self._chunks, dtype=self.dtype) index_selection = PartialChunkIterator(chunk_selection, self.chunks) for start, nitems, partial_out_selection in index_selection: expected_shape = [ len( range(*partial_out_selection[i].indices(self.chunks[0] + 1)) ) if i < len(partial_out_selection) else dim for i, dim in enumerate(self.chunks) ] cdata.read_part(start, nitems) chunk_partial = self._decode_chunk( cdata.buff, start=start, nitems=nitems, expected_shape=expected_shape, ) tmp[partial_out_selection] = chunk_partial out[out_selection] = tmp[chunk_selection] return except ArrayIndexError: cdata = cdata.read_full() chunk = self._decode_chunk(cdata) # select data from chunk if fields: chunk = chunk[fields] tmp = chunk[chunk_selection] if drop_axes: tmp = np.squeeze(tmp, axis=drop_axes) # store selected data in output out[out_selection] = tmp def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes=None, fields=None): """Obtain part or whole of a chunk. Parameters ---------- chunk_coords : tuple of ints Indices of the chunk. chunk_selection : selection Location of region within the chunk to extract. out : ndarray Array to store result in. out_selection : selection Location of region within output array to store results in. drop_axes : tuple of ints Axes to squeeze out of the chunk. fields TODO """ out_is_ndarray = True try: out = ensure_ndarray(out) except TypeError: out_is_ndarray = False assert len(chunk_coords) == len(self._cdata_shape) # obtain key for chunk ckey = self._chunk_key(chunk_coords) try: # obtain compressed data for chunk cdata = self.chunk_store[ckey] except KeyError: # chunk not initialized if self._fill_value is not None: if fields: fill_value = self._fill_value[fields] else: fill_value = self._fill_value out[out_selection] = fill_value else: self._process_chunk(out, cdata, chunk_selection, drop_axes, out_is_ndarray, fields, out_selection) def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, drop_axes=None, fields=None): """As _chunk_getitem, but for lists of chunks This gets called where the storage supports ``getitems``, so that it can decide how to fetch the keys, allowing concurrency. """ out_is_ndarray = True try: out = ensure_ndarray(out) except TypeError: # pragma: no cover out_is_ndarray = False ckeys = [self._chunk_key(ch) for ch in lchunk_coords] if ( self._partial_decompress and self._compressor and self._compressor.codec_id == "blosc" and hasattr(self._compressor, "decode_partial") and not fields and self.dtype != object and hasattr(self.chunk_store, "getitems") ): partial_read_decode = True cdatas = { ckey: PartialReadBuffer(ckey, self.chunk_store) for ckey in ckeys if ckey in self.chunk_store } else: partial_read_decode = False cdatas = self.chunk_store.getitems(ckeys, on_error="omit") for ckey, chunk_select, out_select in zip(ckeys, lchunk_selection, lout_selection): if ckey in cdatas: self._process_chunk( out, cdatas[ckey], chunk_select, drop_axes, out_is_ndarray, fields, out_select, partial_read_decode=partial_read_decode, ) else: # check exception type if self._fill_value is not None: if fields: fill_value = self._fill_value[fields] else: fill_value = self._fill_value out[out_select] = fill_value def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): ckeys = map(self._chunk_key, lchunk_coords) cdatas = {key: self._process_for_setitem(key, sel, val, fields=fields) for key, sel, val in zip(ckeys, lchunk_selection, values)} to_store = {} if not self.write_empty_chunks: empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)} self._chunk_delitems(empty_chunks.keys()) nonempty_keys = cdatas.keys() - empty_chunks.keys() to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} else: to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} self.chunk_store.setitems(to_store) def _chunk_delitems(self, ckeys): if hasattr(self.store, "delitems"): self.store.delitems(ckeys) else: # pragma: no cover # exempting this branch from coverage as there are no extant stores # that will trigger this condition, but it's possible that they # will be developed in the future. tuple(map(self._chunk_delitem, ckeys)) def _chunk_delitem(self, ckey): """ Attempt to delete the value associated with ckey. """ try: del self.chunk_store[ckey] except KeyError: pass def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): """Replace part or whole of a chunk. Parameters ---------- chunk_coords : tuple of ints Indices of the chunk. chunk_selection : tuple of slices Location of region within the chunk. value : scalar or ndarray Value to set. """ if self._synchronizer is None: # no synchronization lock = nolock else: # synchronize on the chunk ckey = self._chunk_key(chunk_coords) lock = self._synchronizer[ckey] with lock: self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, fields=fields) def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) # attempt to delete chunk if it only contains the fill value if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): self._chunk_delitem(ckey) else: self.chunk_store[ckey] = self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): if is_total_slice(chunk_selection, self._chunks) and not fields: # totally replace chunk # optimization: we are completely replacing the chunk, so no need # to access the existing chunk data if is_scalar(value, self._dtype): # setup array filled with value chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) chunk.fill(value) else: # ensure array is contiguous chunk = value.astype(self._dtype, order=self._order, copy=False) else: # partially replace the contents of this chunk try: # obtain compressed data for chunk cdata = self.chunk_store[ckey] except KeyError: # chunk not initialized if self._fill_value is not None: chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) chunk.fill(self._fill_value) elif self._dtype == object: chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) else: # N.B., use zeros here so any region beyond the array has consistent # and compressible data chunk = np.zeros(self._chunks, dtype=self._dtype, order=self._order) else: # decode chunk chunk = self._decode_chunk(cdata) if not chunk.flags.writeable: chunk = chunk.copy(order='K') # modify if fields: # N.B., currently multi-field assignment is not supported in numpy, so # this only works for a single field chunk[fields][chunk_selection] = value else: chunk[chunk_selection] = value return chunk def _chunk_key(self, chunk_coords): return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): # decompress if self._compressor: # only decode requested items if ( all(x is not None for x in [start, nitems]) and self._compressor.codec_id == "blosc" ) and hasattr(self._compressor, "decode_partial"): chunk = self._compressor.decode_partial(cdata, start, nitems) else: chunk = self._compressor.decode(cdata) else: chunk = cdata # apply filters if self._filters: for f in reversed(self._filters): chunk = f.decode(chunk) # view as numpy array with correct dtype chunk = ensure_ndarray(chunk) # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening if self._dtype != object: chunk = chunk.view(self._dtype) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. # We cannot deal with object arrays unless there is an object # codec in the filter chain, i.e., a filter that converts from object # array to something else during encoding, and converts back to object # array during decoding. raise RuntimeError('cannot read object array without object codec') # ensure correct chunk shape chunk = chunk.reshape(-1, order='A') chunk = chunk.reshape(expected_shape or self._chunks, order=self._order) return chunk def _encode_chunk(self, chunk): # apply filters if self._filters: for f in self._filters: chunk = f.encode(chunk) # check object encoding if ensure_ndarray(chunk).dtype == object: raise RuntimeError('cannot write object array without object codec') # compress if self._compressor: cdata = self._compressor.encode(chunk) else: cdata = chunk # ensure in-memory data is immutable and easy to compare if isinstance(self.chunk_store, MutableMapping): cdata = ensure_bytes(cdata) return cdata def __repr__(self): t = type(self) r = '<{}.{}'.format(t.__module__, t.__name__) if self.name: r += ' %r' % self.name r += ' %s' % str(self.shape) r += ' %s' % self.dtype if self._read_only: r += ' read-only' r += '>' return r @property def info(self): """Report some diagnostic information about the array. Examples -------- >>> import zarr >>> z = zarr.zeros(1000000, chunks=100000, dtype='i4') >>> z.info Type : zarr.core.Array Data type : int32 Shape : (1000000,) Chunk shape : (100000,) Order : C Read-only : False Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) Store type : zarr.storage.KVStore No. bytes : 4000000 (3.8M) No. bytes stored : 320 Storage ratio : 12500.0 Chunks initialized : 0/10 """ return self._info_reporter def info_items(self): return self._synchronized_op(self._info_items_nosync) def _info_items_nosync(self): def typestr(o): return '{}.{}'.format(type(o).__module__, type(o).__name__) def bytestr(n): if n > 2**10: return '{} ({})'.format(n, human_readable_size(n)) else: return str(n) items = [] # basic info if self.name is not None: items += [('Name', self.name)] items += [ ('Type', typestr(self)), ('Data type', '%s' % self.dtype), ('Shape', str(self.shape)), ('Chunk shape', str(self.chunks)), ('Order', self.order), ('Read-only', str(self.read_only)), ] # filters if self.filters: for i, f in enumerate(self.filters): items += [('Filter [%s]' % i, repr(f))] # compressor items += [('Compressor', repr(self.compressor))] # synchronizer if self._synchronizer is not None: items += [('Synchronizer type', typestr(self._synchronizer))] # storage info items += [('Store type', typestr(self._store))] if self._chunk_store is not None: items += [('Chunk store type', typestr(self._chunk_store))] items += [('No. bytes', bytestr(self.nbytes))] if self.nbytes_stored > 0: items += [ ('No. bytes stored', bytestr(self.nbytes_stored)), ('Storage ratio', '%.1f' % (self.nbytes / self.nbytes_stored)), ] items += [ ('Chunks initialized', '{}/{}'.format(self.nchunks_initialized, self.nchunks)) ] return items
[docs] def digest(self, hashname="sha1"): """ Compute a checksum for the data. Default uses sha1 for speed. Examples -------- >>> import binascii >>> import zarr >>> z = zarr.empty(shape=(10000, 10000), chunks=(1000, 1000)) >>> binascii.hexlify(z.digest()) b'041f90bc7a571452af4f850a8ca2c6cddfa8a1ac' >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) >>> binascii.hexlify(z.digest()) b'7162d416d26a68063b66ed1f30e0a866e4abed60' >>> z = zarr.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000)) >>> binascii.hexlify(z.digest()) b'cb387af37410ae5a3222e893cf3373e4e4f22816' """ h = hashlib.new(hashname) for i in itertools.product(*[range(s) for s in self.cdata_shape]): h.update(self.chunk_store.get(self._chunk_key(i), b"")) h.update(self.store.get(self._key_prefix + array_meta_key, b"")) h.update(self.store.get(self.attrs.key, b"")) checksum = h.digest() return checksum
[docs] def hexdigest(self, hashname="sha1"): """ Compute a checksum for the data. Default uses sha1 for speed. Examples -------- >>> import zarr >>> z = zarr.empty(shape=(10000, 10000), chunks=(1000, 1000)) >>> z.hexdigest() '041f90bc7a571452af4f850a8ca2c6cddfa8a1ac' >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) >>> z.hexdigest() '7162d416d26a68063b66ed1f30e0a866e4abed60' >>> z = zarr.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000)) >>> z.hexdigest() 'cb387af37410ae5a3222e893cf3373e4e4f22816' """ checksum = binascii.hexlify(self.digest(hashname=hashname)) # This is a bytes object on Python 3 and we want a str. if type(checksum) is not str: checksum = checksum.decode('utf8') return checksum
def __getstate__(self): return (self._store, self._path, self._read_only, self._chunk_store, self._synchronizer, self._cache_metadata, self._attrs.cache, self._partial_decompress, self._write_empty_chunks) def __setstate__(self, state): self.__init__(*state) def _synchronized_op(self, f, *args, **kwargs): if self._synchronizer is None: # no synchronization lock = nolock else: # synchronize on the array mkey = self._key_prefix + array_meta_key lock = self._synchronizer[mkey] with lock: self._refresh_metadata_nosync() result = f(*args, **kwargs) return result def _write_op(self, f, *args, **kwargs): # guard condition if self._read_only: raise ReadOnlyError() return self._synchronized_op(f, *args, **kwargs)
[docs] def resize(self, *args): """Change the shape of the array by growing or shrinking one or more dimensions. Examples -------- >>> import zarr >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) >>> z.shape (10000, 10000) >>> z.resize(20000, 10000) >>> z.shape (20000, 10000) >>> z.resize(30000, 1000) >>> z.shape (30000, 1000) Notes ----- When resizing an array, the data are not rearranged in any way. If one or more dimensions are shrunk, any chunks falling outside the new array shape will be deleted from the underlying store. """ return self._write_op(self._resize_nosync, *args)
def _resize_nosync(self, *args): # normalize new shape argument old_shape = self._shape new_shape = normalize_resize_args(old_shape, *args) old_cdata_shape = self._cdata_shape # update metadata self._shape = new_shape self._flush_metadata_nosync() # determine the new number and arrangement of chunks chunks = self._chunks new_cdata_shape = tuple(math.ceil(s / c) for s, c in zip(new_shape, chunks)) # remove any chunks not within range chunk_store = self.chunk_store for cidx in itertools.product(*[range(n) for n in old_cdata_shape]): if all(i < c for i, c in zip(cidx, new_cdata_shape)): pass # keep the chunk else: key = self._chunk_key(cidx) try: del chunk_store[key] except KeyError: # chunk not initialized pass
[docs] def append(self, data, axis=0): """Append `data` to `axis`. Parameters ---------- data : array_like Data to be appended. axis : int Axis along which to append. Returns ------- new_shape : tuple Notes ----- The size of all dimensions other than `axis` must match between this array and `data`. Examples -------- >>> import numpy as np >>> import zarr >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) >>> z = zarr.array(a, chunks=(1000, 100)) >>> z.shape (10000, 1000) >>> z.append(a) (20000, 1000) >>> z.append(np.vstack([a, a]), axis=1) (20000, 2000) >>> z.shape (20000, 2000) """ return self._write_op(self._append_nosync, data, axis=axis)
def _append_nosync(self, data, axis=0): # ensure data is array-like if not hasattr(data, 'shape'): data = np.asanyarray(data) # ensure shapes are compatible for non-append dimensions self_shape_preserved = tuple(s for i, s in enumerate(self._shape) if i != axis) data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) if self_shape_preserved != data_shape_preserved: raise ValueError('shape of data to append is not compatible with the array; ' 'all dimensions must match except for the dimension being ' 'appended') # remember old shape old_shape = self._shape # determine new shape new_shape = tuple( self._shape[i] if i != axis else self._shape[i] + data.shape[i] for i in range(len(self._shape)) ) # resize self._resize_nosync(new_shape) # store data # noinspection PyTypeChecker append_selection = tuple( slice(None) if i != axis else slice(old_shape[i], new_shape[i]) for i in range(len(self._shape)) ) self[append_selection] = data return new_shape
[docs] def view(self, shape=None, chunks=None, dtype=None, fill_value=None, filters=None, read_only=None, synchronizer=None): """Return an array sharing the same data. Parameters ---------- shape : int or tuple of ints Array shape. chunks : int or tuple of ints, optional Chunk shape. dtype : string or dtype, optional NumPy dtype. fill_value : object Default value to use for uninitialized portions of the array. filters : sequence, optional Sequence of filters to use to encode chunk data prior to compression. read_only : bool, optional True if array should be protected against modification. synchronizer : object, optional Array synchronizer. Notes ----- WARNING: This is an experimental feature and should be used with care. There are plenty of ways to generate errors and/or cause data corruption. Examples -------- Bypass filters: >>> import zarr >>> import numpy as np >>> np.random.seed(42) >>> labels = ['female', 'male'] >>> data = np.random.choice(labels, size=10000) >>> filters = [zarr.Categorize(labels=labels, ... dtype=data.dtype, ... astype='u1')] >>> a = zarr.array(data, chunks=1000, filters=filters) >>> a[:] array(['female', 'male', 'female', ..., 'male', 'male', 'female'], dtype='<U6') >>> v = a.view(dtype='u1', filters=[]) >>> v.is_view True >>> v[:] array([1, 2, 1, ..., 2, 2, 1], dtype=uint8) Views can be used to modify data: >>> x = v[:] >>> x.sort() >>> v[:] = x >>> v[:] array([1, 1, 1, ..., 2, 2, 2], dtype=uint8) >>> a[:] array(['female', 'female', 'female', ..., 'male', 'male', 'male'], dtype='<U6') View as a different dtype with the same item size: >>> data = np.random.randint(0, 2, size=10000, dtype='u1') >>> a = zarr.array(data, chunks=1000) >>> a[:] array([0, 0, 1, ..., 1, 0, 0], dtype=uint8) >>> v = a.view(dtype=bool) >>> v[:] array([False, False, True, ..., True, False, False]) >>> np.all(a[:].view(dtype=bool) == v[:]) True An array can be viewed with a dtype with a different item size, however some care is needed to adjust the shape and chunk shape so that chunk data is interpreted correctly: >>> data = np.arange(10000, dtype='u2') >>> a = zarr.array(data, chunks=1000) >>> a[:10] array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint16) >>> v = a.view(dtype='u1', shape=20000, chunks=2000) >>> v[:10] array([0, 0, 1, 0, 2, 0, 3, 0, 4, 0], dtype=uint8) >>> np.all(a[:].view('u1') == v[:]) True Change fill value for uninitialized chunks: >>> a = zarr.full(10000, chunks=1000, fill_value=-1, dtype='i1') >>> a[:] array([-1, -1, -1, ..., -1, -1, -1], dtype=int8) >>> v = a.view(fill_value=42) >>> v[:] array([42, 42, 42, ..., 42, 42, 42], dtype=int8) Note that resizing or appending to views is not permitted: >>> a = zarr.empty(10000) >>> v = a.view() >>> try: ... v.resize(20000) ... except PermissionError as e: ... print(e) operation not permitted for views """ store = self._store chunk_store = self._chunk_store path = self._path if read_only is None: read_only = self._read_only if synchronizer is None: synchronizer = self._synchronizer a = Array(store=store, path=path, chunk_store=chunk_store, read_only=read_only, synchronizer=synchronizer, cache_metadata=True) a._is_view = True # allow override of some properties if dtype is None: dtype = self._dtype else: dtype = np.dtype(dtype) a._dtype = dtype if shape is None: shape = self._shape else: shape = normalize_shape(shape) a._shape = shape if chunks is not None: chunks = normalize_chunks(chunks, shape, dtype.itemsize) a._chunks = chunks if fill_value is not None: a._fill_value = fill_value if filters is not None: a._filters = filters return a
[docs] def astype(self, dtype): """Returns a view that does on the fly type conversion of the underlying data. Parameters ---------- dtype : string or dtype NumPy dtype. Notes ----- This method returns a new Array object which is a view on the same underlying chunk data. Modifying any data via the view is currently not permitted and will result in an error. This is an experimental feature and its behavior is subject to change in the future. See Also -------- Array.view Examples -------- >>> import zarr >>> import numpy as np >>> data = np.arange(100, dtype=np.uint8) >>> a = zarr.array(data, chunks=10) >>> a[:] array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=uint8) >>> v = a.astype(np.float32) >>> v.is_view True >>> v[:] array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91., 92., 93., 94., 95., 96., 97., 98., 99.], dtype=float32) """ dtype = np.dtype(dtype) filters = [] if self._filters: filters.extend(self._filters) filters.insert(0, AsType(encode_dtype=self._dtype, decode_dtype=dtype)) return self.view(filters=filters, dtype=dtype, read_only=True)