Add index generation tooling for detecting source images

2025-04-06 09:53:32 +00:00 · 2021-11-12 23:31:23 -05:00 · 2021-11-12 23:31:23 -05:00 · caa05e6e32
commit caa05e6e32
parent 56e683a94a
7 changed files with 295 additions and 10 deletions
--- a/kodak/application.py
+++ b/kodak/application.py
@ -1,6 +1,7 @@
 import flask_restful
 from kodak import resources
 from kodak import tools
 from kodak._server import initialize_database
 from kodak._server import KodakFlask
 from kodak._server import make_the_tea
@ -12,6 +13,7 @@ API = flask_restful.Api(APPLICATION, catch_all_404s=True)
 APPLICATION.before_request(make_the_tea)
 APPLICATION.before_first_request(initialize_database)
 APPLICATION.before_first_request(tools.index.build)
 for resource in resources.RESOURCES:
    API.add_resource(resource, *resource.routes)
--- a/kodak/constants.py
+++ b/kodak/constants.py
@ -1,4 +1,7 @@
 import enum
 from typing import Any
 from typing import Dict
 from typing import Set
 import peewee
@ -34,11 +37,11 @@ class CropAnchor(enum.Enum):
 class ImageFormat(enum.Enum):
    """Supported image conversion formats"""
-    JPEG = enum.auto()
+    JPEG = ("jpg", "jpeg")
-    PNG = enum.auto()
+    PNG = ("png",)
-DEFAULT_SQLITE_PRAGMAS = {
+DEFAULT_SQLITE_PRAGMAS: Dict[str, Any] = {
    "journal_mode": "wal",
    "cache_size": -1 * 64000,
    "foreign_keys": 1,
@ -46,4 +49,14 @@ DEFAULT_SQLITE_PRAGMAS = {
    "synchronous": 0,
 }
-DEFAULT_SUPPORTED_FORMATS = {ImageFormat.JPEG, ImageFormat.PNG}
+SQLITE_VARIABLE_LIMIT = 999
 DEFAULT_SUPPORTED_FORMATS: Set[ImageFormat] = {ImageFormat.JPEG, ImageFormat.PNG}
 IMAGE_PATH_NAME_SEPARATOR: str = "-"
 # Every file extension accepted by any member of ``ImageFormat``, each stored with a leading
 # dot (the members' values are tuples of bare extensions, e.g. ``("jpg", "jpeg")``).
 IMAGE_FILE_EXTENSIONS: Set[str] = set()
 for item in ImageFormat:
    for ext in item.value:
        IMAGE_FILE_EXTENSIONS.add(f".{ext}")
--- a/kodak/database/init.py
+++ b/kodak/database/init.py
@ -1,4 +1,5 @@
 import logging
 from typing import Sequence
 from typing import Tuple
 from typing import Type
@ -17,6 +18,51 @@ from kodak.database.image import ImageRecord
 MODELS: Tuple[Type[KodakModel], ...] = (ImageRecord, AliasRecord, AccessRecord)
 def calc_batch_size(
    backend: constants.DatabaseBackend, models: Sequence[KodakModel]
 ) -> int:
    """Compute how many records can be written by a single bulk query

    SQLite restricts the number of _variables_ (not _records_) that one query may reference.
    Working out the exact number of variables a batch needs would mean tallying the changed
    fields of every model, and peewee offers no simple, reliable way to identify changed fields.

    This function therefore takes the pessimistic shortcut: it assumes every record uses the
    largest number of variables it possibly could, which is one per field plus one for the table
    name, and sizes every batch accordingly:

    ::

      SQLITE_VARIABLE_LIMIT / (len(fields) + 1)

    .. note:: The field count is read from the first element of ``models`` only, so all of the
              records are assumed to belong to the same model (i.e. the same table).

    .. note:: For any backend other than ``SQLITE``, and for an empty ``models`` sequence, the
              function simply returns ``len(models)`` so that everything lands in one batch.

    :param backend: Backend being used by the application
    :param models: Sequence of models that need to be batched
    :returns: Number of models that can be processed in a single batch
    """
    # Guard clause: the variable limit only matters for a non-empty batch on SQLite
    if not models or backend != constants.DatabaseBackend.SQLITE:
        return len(models)

    # Worst case for one record: every field is written, plus one variable for the table name
    variables_per_record = len(models[0].fields) + 1
    return constants.SQLITE_VARIABLE_LIMIT // variables_per_record
 def initialize(config: KodakConfig):
    """Initialize the database interface
--- a/kodak/database/_shared.py
+++ b/kodak/database/_shared.py
@ -2,6 +2,8 @@ import datetime
 import enum
 import typing
 import uuid
 from pathlib import Path
 from typing import Dict
 from typing import NamedTuple
 from typing import Type
@ -48,7 +50,7 @@ class EnumField(peewee.CharField):
            raise peewee.IntegrityError(
                f"Enum {self.enumeration.__name__} has no value '{value}'"
            )
-        return value.name
+        return super().db_value(value.name)
    def python_value(self, value: str) -> enum.Enum:
        """Convert the stored string to the corresponding enum
@ -59,13 +61,25 @@ class EnumField(peewee.CharField):
        :returns: The enum item with the name passed to ``value``
        """
        try:
-            return self.enumeration[value]
+            return self.enumeration[super().python_value(value)]
        except KeyError:
            raise peewee.InterfaceError(
                f"Enum {self.enumeration.__name__} has no value with name '{value}'"
            ) from None
 class PathField(peewee.CharField):
    """Field for storing filesystem paths in the database

    Paths are persisted as their string form and handed back as :class:`pathlib.Path` objects.
    ``None`` is passed through unchanged in both directions so that a missing value is never
    turned into the literal text ``"None"`` on write or a ``TypeError`` on read.
    """

    def db_value(self, value: typing.Optional[Path]) -> typing.Optional[str]:
        """Serialize a pathlib object to a database string

        :param value: Path to store, or ``None`` when there is no value
        :returns: The string form of the path after the parent field's own conversion, or
                  ``None`` if ``value`` is ``None``
        """
        if value is None:
            # str(None) would silently persist the four-character string "None"
            return None
        return super().db_value(str(value))

    def python_value(self, value: typing.Optional[str]) -> typing.Optional[Path]:
        """Deserialize a database string to a pathlib object

        :param value: String read from the database, or ``None`` when the column holds no value
        :returns: Path built from the stored string, or ``None`` if ``value`` is ``None``
        """
        if value is None:
            # Path(None) raises TypeError, so an empty column has to round-trip as None
            return None
        return Path(super().python_value(value))
 class ChecksumField(peewee.CharField):
    """Field for storing checksum hashes in the database
@ -79,11 +93,11 @@ class ChecksumField(peewee.CharField):
    def db_value(self, value: Checksum) -> str:
        """Serialize the checkstum to a database string"""
-        return f"{value.algorithm}:{value.digest}"
+        return super().db_value(f"{value.algorithm}:{value.digest}")
    def python_value(self, value: str) -> Checksum:
        """Deserailize a string to a checksum container"""
-        alg, _, digest = value.partition(":")
+        alg, _, digest = super().python_value(value).partition(":")
        return Checksum(algorithm=alg, digest=digest)
@ -95,3 +109,9 @@ class KodakModel(peewee.Model):
    uuid = peewee.UUIDField(null=False, unique=True, default=uuid.uuid4)
    created = peewee.DateTimeField(null=False, default=datetime.datetime.utcnow)
    @classmethod
    @property
    def fields(cls) -> Dict[str, peewee.Field]:
        """Expose the peewee field metadata as a public object

        :returns: The ``fields`` object held on the model's peewee ``_meta`` attribute
        """
        # NOTE(review): stacking ``@classmethod`` on top of ``@property`` was deprecated in
        #               Python 3.11 and removed in Python 3.13 -- confirm which interpreter
        #               versions this project needs to support.
        return cls._meta.fields  # pylint: disable=protected-access
--- a/kodak/database/image.py
+++ b/kodak/database/image.py
@ -1,15 +1,80 @@
 import hashlib
 import os
 from pathlib import Path
 import peewee
 from kodak import configuration
 from kodak import constants
 from kodak.database._shared import Checksum
 from kodak.database._shared import ChecksumField
 from kodak.database._shared import EnumField
 from kodak.database._shared import KodakModel
 from kodak.database._shared import PathField
 class ImageRecord(KodakModel):
    """Model for source images"""
-    name = peewee.Charfield(null=False)
+    name = peewee.CharField(null=False)
-    format = EnumField(constants.ImageFormat, null=False)
+    source = PathField(null=False)
    format_ = EnumField(constants.ImageFormat, null=False)
    deleted = peewee.BooleanField(null=False, default=False)
    checksum = ChecksumField(null=False)
    @classmethod
    def from_path(cls, config: configuration.KodakConfig, path: Path):
        """Construct an image record from a path

        :param config: Populated application configuration object
        :param path: Full path to the image file to process. The file path provided is expected to
                     already be absolute, with all symlinks and aliases resolved.
        :returns: New, unsaved record carrying the image name, source path, detected format,
                  and SHA-256 checksum of the file contents
        :raises RuntimeError: If the file extension does not belong to any ``ImageFormat``

        .. note:: This method attempts to _efficiently_ compute a hash of large image files. The
                  hashing code was adapted from here:
                  https://stackoverflow.com/a/44873382/5361209
        """
        # Resolve the format first so an unsupported file is rejected before it is read.
        # The extension is matched case-insensitively and without its leading dot.
        extension = path.suffix.lower()[1:]
        for item in constants.ImageFormat:
            if extension in item.value:
                format_ = item
                break
        else:
            raise RuntimeError(
                f"Unsupported image file extension '{path.suffix}' on file {path}"
            )

        hasher = hashlib.sha256()
        # One reusable 1 MiB buffer: the file is streamed through it instead of being loaded
        # whole, and ``readinto`` returning 0 marks the end of the file.
        view = memoryview(bytearray(1024 * 1024))
        with path.open("rb", buffering=0) as infile:
            for chunk in iter(lambda: infile.readinto(view), 0):
                hasher.update(view[:chunk])

        # NOTE(review): ``path.stem`` is only the final path component without its suffix, so
        #               both replacements below look like no-ops and images that share a stem in
        #               different subdirectories receive the same name. Confirm whether the path
        #               relative to ``config.source_dir`` was intended here.
        name = path.stem.replace(str(config.source_dir), "").replace(
            os.sep, constants.IMAGE_PATH_NAME_SEPARATOR
        )

        return cls(
            name=name, source=path, format_=format_, checksum=Checksum.from_hash(hasher)
        )
    def create_link(self, config: configuration.KodakConfig) -> Path:
        """Link this image's source file into the content directory

        The link is named after the record's ``name`` and placed directly in the configured
        content directory. If something already exists at that location it is left as-is.

        :param config: Populated application configuration object
        :returns: Path to the created symbolic link back to the source file
        """
        destination = Path(config.content_dir) / self.name

        try:
            destination.symlink_to(self.source)
        except FileExistsError:
            # An entry with this name is already present; keep it and carry on
            pass

        return destination
    def remove_link(self, config: configuration.KodakConfig) -> None:
        """Delete this image's link from the content directory

        A link that does not exist is not treated as an error.

        :param config: Populated application configuration object
        """
        target = Path(config.content_dir) / self.name
        target.unlink(missing_ok=True)
--- a/kodak/tools/init.py
+++ b/kodak/tools/init.py
@ -0,0 +1 @@
 from kodak.tools import index
--- a/kodak/tools/index.py
+++ b/kodak/tools/index.py
@ -0,0 +1,138 @@
 import logging
 from pathlib import Path
 from typing import List
 from typing import Optional
 from kodak import configuration
 from kodak import constants
 from kodak import database
 def identify(config: configuration.KodakConfig) -> List[database.ImageRecord]:
    """Identify source images that will be made available

    The source directory is walked recursively and every file with a supported image extension
    is collected. Files whose path is already stored on an existing image record are skipped;
    each remaining file is turned into a new record via ``ImageRecord.from_path``.

    :param config: Populated application configuration object
    :returns: List of (unsaved) database models representing identified source image files
    """
    # Bound before the helper is defined so the closure never depends on statement order
    logger = logging.getLogger(__name__)

    def _identify(path: Path) -> List[Path]:
        """Recursively collect the supported image files found under ``path``"""
        identified = []
        for item in path.iterdir():
            # Compare the extension case-insensitively, the same way ``ImageRecord.from_path``
            # resolves the format, so that files such as ``PHOTO.JPG`` are not dropped
            if item.is_file() and item.suffix.lower() in constants.IMAGE_FILE_EXTENSIONS:
                logger.debug(f"Including file {item}")
                identified.append(item)
            elif item.is_dir():
                logger.debug(f"Entering subdirectory {item}")
                identified += _identify(item)
            else:
                logger.debug(f"Skipping {item}")
        return identified

    # Sorted only so the log line is stable between runs (the extensions live in a set)
    extensions = ", ".join(sorted(constants.IMAGE_FILE_EXTENSIONS))
    logger.info(
        f"Identifying image files with extensions {extensions} under {config.source_dir}"
    )
    images = _identify(config.source_dir)
    logger.info(f"Identified {len(images)} files under {config.source_dir}")

    with database.interface.atomic():
        existing = [
            item.source
            for item in database.ImageRecord.select(database.ImageRecord.source)
        ]
    logger.debug(f"Fetched {len(existing)} existing image records")

    # Set membership keeps the comparison below from rescanning the whole list for every file
    known_sources = set(existing)

    results = []
    for image in images:
        if image in known_sources:
            logger.debug(f"Skipping existing {image}")
        else:
            logger.debug(f"Including newly identified image {image}")
            results.append(database.ImageRecord.from_path(config, image))
    return results
 def clean() -> List[database.ImageRecord]:
    """Identify source images that no longer exist and mark their records as deleted

    Every image record not yet flagged as deleted is checked against the filesystem. Records
    whose source file is missing get ``deleted`` set to ``True`` in memory only; persisting the
    change is left to the caller.

    :returns: List of (unsaved) database models representing source images that have been deleted
              or removed
    """
    logger = logging.getLogger(__name__)

    with database.interface.atomic():
        # Complete records are fetched on purpose: the caller persists them with ``bulk_update``
        # and then calls ``remove_link`` on them, which reads ``name``. A query restricted to
        # the ``source`` column would return records without those values.
        # ``list()`` forces the query to run while still inside the atomic block.
        existing = list(
            database.ImageRecord.select().where(
                database.ImageRecord.deleted  # pylint: disable=singleton-comparison
                == False
            )
        )
    logger.info(f"Identified {len(existing)} existing image records")

    deleted = []
    for item in existing:
        if item.source.exists():
            logger.debug(
                f"Image file exists, record will not be modified: {item.source}"
            )
        else:
            logger.debug(f"Image file removed, record will be deleted: {item.source}")
            item.deleted = True
            deleted.append(item)

    logger.info(f"Identified {len(deleted)} image records to be marked as deleted")
    return deleted
 def build(config: Optional[configuration.KodakConfig] = None) -> None:
    """Build and update the file index

    Runs the full indexing pass in order:

    1. Create records for newly identified source images
    2. Mark records whose source file has disappeared as deleted and drop their links
    3. Create (or remove, when ``expose_source`` is disabled) the source link of every record
       that is not marked as deleted

    :param config: Populated application configuration object. When omitted, the configuration
                   is loaded via ``configuration.load()``.
    """
    logger = logging.getLogger(__name__)
    config = config or configuration.load()

    new_images = identify(config)
    # ``calc_batch_size`` returns 0 for an empty sequence, so only touch the database when
    # there is something to write
    if new_images:
        with database.interface.atomic():
            database.ImageRecord.bulk_create(
                new_images,
                batch_size=database.calc_batch_size(
                    config.database.backend, new_images
                ),
            )

    removed_images = clean()
    if removed_images:
        with database.interface.atomic():
            database.ImageRecord.bulk_update(
                removed_images,
                fields=[database.ImageRecord.deleted],
                batch_size=database.calc_batch_size(
                    config.database.backend, removed_images
                ),
            )

    logger.info(f"Removing source links to {len(removed_images)} removed image files")
    for image in removed_images:
        logger.debug(f"Removing link to removed source image {image.source}")
        image.remove_link(config)

    logger.info("Processing source links")
    with database.interface.atomic():
        for image in database.ImageRecord.select().where(
            database.ImageRecord.deleted
            == False  # pylint: disable=singleton-comparison
        ):
            if config.expose_source:
                logger.debug(f"Creating source link to {image.source}")
                image.create_link(config)
            else:
                logger.debug(f"Removing source link to {image.source}")
                # ``ImageRecord`` defines ``remove_link``; there is no ``delete_link`` method
                image.remove_link(config)