Source code for mbo_utilities.file_io

from collections import defaultdict
from collections.abc import Sequence
import re

from pathlib import Path
import numpy as np

import dask.array as da
from tifffile import tifffile

from . import log

try:
    from zarr import open as zarr_open
    from zarr.storage import FsspecStore
    from fsspec.implementations.reference import ReferenceFileSystem

    HAS_ZARR = True
except ImportError:
    HAS_ZARR = False
    zarr_open = None
    ReferenceFileSystem = None
    FsspecStore = None

CHUNKS = {0: 1, 1: "auto", 2: -1, 3: -1}

logger = log.get("file_io")


[docs] def expand_paths(paths: str | Path | Sequence[str | Path]) -> list[Path]: r""" Expand a path, list of paths, or wildcard pattern into a sorted list of actual files. This is a handy wrapper for loading images or data files when you’ve got a folder, some wildcards, or a mix of both. Parameters ---------- paths : str, Path, or list of (str or Path) Can be a single path, a wildcard pattern like "\\*.tif", a folder, or a list of those. Returns ------- list of Path Sorted list of full paths to matching files. Examples -------- >>> expand_paths("data/\\*.tif") [Path("data/img_000.tif"), Path("data/img_001.tif"), ...] >>> expand_paths(Path("data")) [Path("data/img_000.tif"), Path("data/img_001.tif"), ...] >>> expand_paths(["data/\\*.tif", Path("more_data")]) [Path("data/img_000.tif"), Path("more_data/img_050.tif"), ...] """ if isinstance(paths, (str, Path)): paths = [paths] elif not isinstance(paths, (list, tuple)): raise TypeError(f"Expected str, Path, or sequence of them, got {type(paths)}") result = [] for p in paths: p = Path(p) if "*" in str(p): result.extend(p.parent.glob(p.name)) elif p.is_dir(): result.extend(p.glob("*")) elif p.exists() and p.is_file(): result.append(p) return sorted(p.resolve() for p in result if p.is_file())
def sort_by_si_filename(filename):
    """Sort key for ScanImage files: the last run of digits in the path.

    The whole path string is searched, not only the final component, so a
    number in a parent directory or in the extension is used when it is the
    last one (``plane03/ops.npy`` -> 3). Returns 0 when there are no digits.
    """
    numbers = re.findall(r"\d+", str(filename))
    return int(numbers[-1]) if numbers else 0


def _is_leaf_zarr(path: Path) -> bool:
    """Check if a .zarr directory is a leaf (no nested .zarr subdirs).

    Returns False when ``path`` is not a directory or does not end in
    ``.zarr``. Otherwise returns True unless an immediate child directory
    also ends in ``.zarr``. A store whose children cannot be listed is
    treated as a leaf.
    """
    if not path.is_dir() or path.suffix != ".zarr":
        return False

    try:
        for child in path.iterdir():
            if child.is_dir() and child.suffix == ".zarr":
                return False  # has a nested zarr, not a leaf
    except OSError:  # PermissionError is a subclass of OSError
        # Best effort: an unreadable store is reported as a leaf so callers
        # do not try to descend into it.
        pass

    return True


def _walk_with_zarr_filter(base_path: Path, max_depth: int, exclude_dirs: list):
    """Walk a directory tree, stopping at leaf .zarr directories.

    Yields every file found, plus each leaf ``.zarr`` directory as a single
    unit. Does not recurse into:

    - leaf .zarr directories (those without nested .zarr subdirs)
    - directories deeper than ``max_depth`` (``base_path`` is depth 0, so
      ``max_depth=1`` still lists the files of its immediate subdirectories)

    Any entry, file or directory, whose name is in ``exclude_dirs`` is
    skipped. Directories that cannot be listed are skipped silently. Entries
    of each directory are visited in sorted order so the output is
    deterministic across platforms.
    """

    def _walk(current: Path, depth: int):
        if depth > max_depth:
            return

        try:
            # Sorted, because iterdir() order is OS-dependent.
            entries = sorted(current.iterdir())
        except OSError:
            return

        for entry in entries:
            if entry.name in exclude_dirs:
                continue

            if entry.is_file():
                yield entry
            elif entry.is_dir():
                if _is_leaf_zarr(entry):
                    # A zarr store is one dataset: yield the directory itself
                    # and do not descend into its chunk directories.
                    yield entry
                else:
                    yield from _walk(entry, depth + 1)

    yield from _walk(base_path, 0)
def get_files(
    base_dir, str_contains="", max_depth=1, sort_ascending=True, exclude_dirs=None
) -> list[Path]:
    """
    Search a directory tree for files whose names contain a given substring.

    The search is limited to a maximum subdirectory depth. Leaf ``.zarr``
    directories (those that contain no nested ``.zarr`` subdirectory) are
    returned as single entries and are not descended into, which avoids
    traversing thousands of internal chunk directories.

    Parameters
    ----------
    base_dir : str or Path
        The directory where the search begins. ``~`` is expanded and the path
        is resolved to an absolute path, except for UNC paths (starting with
        two backslashes), which are left unresolved.
    str_contains : str, optional
        A substring that must be present in an entry's name for it to be
        included in the result. If empty, all entries are matched.
    max_depth : int, optional
        Maximum depth to search. Defaults to 1. If set to 0, it is
        automatically reset to 1. The base directory counts as depth 0, so
        ``max_depth=1`` also returns files located in its immediate
        subdirectories.
    sort_ascending : bool, optional
        If True (default), results are sorted in ascending order with
        ``sort_by_si_filename`` as the key (the last number found in the
        path), so that, for example, "file2" comes before "file10".
    exclude_dirs : iterable of str or Path, optional
        Names of entries to skip during the walk. ``Path`` items are reduced
        to their final component and a single str/Path is treated as a
        one-element list. Defaults to ``[".venv", ".git", "__pycache__"]``.

    Returns
    -------
    list of Path
        Matching files, and leaf ``.zarr`` directories, found within the base
        directory up to the specified depth.

    Raises
    ------
    FileNotFoundError
        If the base directory does not exist.
    NotADirectoryError
        If the specified base_dir is not a directory.

    Examples
    --------
    >>> import mbo_utilities as mbo
    >>> # Get all files that contain "ops.npy" in their names, searching up to 3 levels deep:
    >>> ops_files = mbo.get_files("path/to/files", "ops.npy", max_depth=3)
    >>> # Get files containing "tif" with the default depth (max_depth=1):
    >>> tif_files = mbo.get_files("path/to/files", "tif")
    """
    # Handle UNC paths carefully - resolve() can break them on Windows
    base_path = Path(base_dir).expanduser()
    # Only resolve non-UNC paths (UNC paths start with \\)
    if not str(base_path).startswith("\\\\"):
        base_path = base_path.resolve()

    if not base_path.exists():
        raise FileNotFoundError(f"Directory '{base_path}' does not exist.")
    if not base_path.is_dir():
        raise NotADirectoryError(f"'{base_path}' is not a directory.")

    # NOTE(review): with the walker's depth counting, max_depth=1 descends one
    # level below base_dir; confirm whether "base directory only" was intended.
    if max_depth == 0:
        max_depth = 1

    if exclude_dirs is None:
        exclude_dirs = [".venv", ".git", "__pycache__"]
    else:
        if isinstance(exclude_dirs, (str, Path)):
            # A bare name would otherwise be substring-matched character-wise.
            exclude_dirs = [exclude_dirs]
        # The walker compares against entry.name (a str), so Path entries are
        # reduced to their final component. Building a list also makes
        # one-shot iterables safe for repeated membership tests.
        exclude_dirs = [
            d.name if isinstance(d, Path) else d for d in exclude_dirs
        ]

    # Use custom walk that handles zarr stores properly
    files = [
        entry
        for entry in _walk_with_zarr_filter(base_path, max_depth, exclude_dirs)
        if not str_contains or str_contains in entry.name
    ]

    if sort_ascending:
        files.sort(key=sort_by_si_filename)
    return files
def load_npy(path):
    """Load .npy file with cross-platform Path object handling.

    Handles pickled PosixPath/WindowsPath objects that fail to load across
    operating systems, including paths pickled under the ``pathlib._local``
    module name, which is not importable on every Python version.

    Parameters
    ----------
    path : str or Path
        Location of the ``.npy`` file.

    Returns
    -------
    object
        For files without object dtype, the result of ``np.load``. For
        object-dtype files, whatever the pickle payload unpickles to (the
        saved array for files written by ``np.save``).

    Notes
    -----
    Object arrays are unpickled, which can execute arbitrary code. Only load
    files from trusted sources.
    """
    import pathlib
    import pickle
    import sys

    path_class_names = ("PosixPath", "WindowsPath", "PurePosixPath", "PureWindowsPath")
    # "pathlib._local" is accepted alongside "pathlib" because some Python
    # versions pickle path objects under that module name.
    pathlib_modules = ("pathlib", "pathlib._local")

    class CrossPlatformUnpickler(pickle.Unpickler):
        """Unpickler that maps any pickled path class to the native ``Path``."""

        def find_class(self, module, name):
            if module in pathlib_modules and name in path_class_names:
                return pathlib.Path
            return super().find_class(module, name)

    try:
        with open(path, "rb") as f:
            version = np.lib.format.read_magic(f)
            # NOTE: _read_array_header is a private NumPy helper; if it is
            # unavailable the broad except below falls back to np.load.
            _shape, _fortran_order, dtype = np.lib.format._read_array_header(f, version)
            if dtype.hasobject:
                # The stream is already positioned just past the header, i.e.
                # at the start of the pickle payload.
                return CrossPlatformUnpickler(f).load()
            f.seek(0)
            return np.load(f, allow_pickle=True)
    except Exception:
        # Deliberate best-effort fallback: temporarily alias the foreign path
        # class to the native one so the stock unpickler can resolve it, then
        # restore pathlib even if loading fails again.
        _original_posix = getattr(pathlib, "PosixPath", None)
        _original_windows = getattr(pathlib, "WindowsPath", None)
        try:
            if sys.platform == "win32":
                pathlib.PosixPath = pathlib.WindowsPath
            else:
                pathlib.WindowsPath = pathlib.PosixPath
            return np.load(path, allow_pickle=True)
        finally:
            if _original_posix is not None:
                pathlib.PosixPath = _original_posix
            if _original_windows is not None:
                pathlib.WindowsPath = _original_windows


def get_last_savedir_path() -> Path:
    """Return path to settings file tracking last saved folder.

    .. deprecated::
        Use :func:`mbo_utilities.preferences.get_last_save_dir` instead.
    """
    import warnings

    warnings.warn(
        "get_last_savedir_path() is deprecated. Use mbo_utilities.preferences.get_last_save_dir() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return Path.home().joinpath(".mbo", "settings", "last_savedir.json")