Add index generation tooling for detecting source images

2025-04-30 13:35:10 +00:00 · 2021-11-12 23:31:23 -05:00 · 2021-11-12 23:31:23 -05:00 · caa05e6e32
commit caa05e6e32
parent 56e683a94a
7 changed files with 295 additions and 10 deletions
--- a/kodak/application.py
+++ b/kodak/application.py
@ -1,6 +1,7 @@
 import flask_restful

 from kodak import resources
+from kodak import tools
 from kodak._server import initialize_database
 from kodak._server import KodakFlask
 from kodak._server import make_the_tea
@ -12,6 +13,7 @@ API = flask_restful.Api(APPLICATION, catch_all_404s=True)

 APPLICATION.before_request(make_the_tea)
 APPLICATION.before_first_request(initialize_database)
+APPLICATION.before_first_request(tools.index.build)

 for resource in resources.RESOURCES:
    API.add_resource(resource, *resource.routes)
--- a/kodak/constants.py
+++ b/kodak/constants.py
@ -1,4 +1,7 @@
 import enum
+from typing import Any
+from typing import Dict
+from typing import Set

 import peewee

@ -34,11 +37,11 @@ class CropAnchor(enum.Enum):
 class ImageFormat(enum.Enum):
    """Supported image conversion formats"""

-    JPEG = enum.auto()
-    PNG = enum.auto()
+    # Each member's value is the tuple of lowercase file extensions (without the leading dot)
+    # associated with that format.
+    JPEG = ("jpg", "jpeg")
+    PNG = ("png",)


-DEFAULT_SQLITE_PRAGMAS = {
+DEFAULT_SQLITE_PRAGMAS: Dict[str, Any] = {
    "journal_mode": "wal",
    "cache_size": -1 * 64000,
    "foreign_keys": 1,
@ -46,4 +49,14 @@ DEFAULT_SQLITE_PRAGMAS = {
    "synchronous": 0,
 }

-DEFAULT_SUPPORTED_FORMATS = {ImageFormat.JPEG, ImageFormat.PNG}
+SQLITE_VARIABLE_LIMIT = 999
+
+DEFAULT_SUPPORTED_FORMATS: Set[ImageFormat] = {ImageFormat.JPEG, ImageFormat.PNG}
+
+IMAGE_PATH_NAME_SEPARATOR: str = "-"
+
+# Dotted file suffixes (e.g. ".jpg") for every extension of every supported image format.
+# Built with a comprehension so that no loop variables leak into the module namespace.
+IMAGE_FILE_EXTENSIONS: Set[str] = {
+    f".{ext}" for image_format in ImageFormat for ext in image_format.value
+}
--- a/kodak/database/init.py
+++ b/kodak/database/init.py
@ -1,4 +1,5 @@
 import logging
+from typing import Sequence
 from typing import Tuple
 from typing import Type

@ -17,6 +18,51 @@ from kodak.database.image import ImageRecord
 MODELS: Tuple[Type[KodakModel], ...] = (ImageRecord, AliasRecord, AccessRecord)


+def calc_batch_size(
+    backend: constants.DatabaseBackend, models: Sequence[KodakModel]
+) -> int:
+    """Determine the batch size that should be used when performing queries
+
+    This is intended to work around the query variable limit in SQLite. Critically this is a
+    limit to the number of _variables_, not _records_ that can be referenced in a single query.
+
+    The "correct" way to calculate this is to iterate over the model list and tally the number of
+    changed fields, then add one for the table name, and each time you reach the
+    ``SQLITE_VARIABLE_LIMIT`` (which is a known constant) cut a new batch until all the models are
+    processed. This is very complicated because peewee doesn't provide a simple way to reliably
+    identify changed fields.
+
+    The naive way to calculate this (i.e. the way this function does it) is to determine the
+    maximum number of variables that _could be_ used to modify a record and use that as the
+    constant batch limiter. The theoretical maximum number of variables associated with a single
+    record is equal to the number of fields on that record, plus 1 (for the table name). This gives
+    the batch size (i.e. number of records that can be modified in a single query) as:
+
+    ::
+
+      max(1, SQLITE_VARIABLE_LIMIT // (len(fields) + 1))
+
+    Where ``fields`` is an array of the fields that could be written on the record. The lower
+    bound of one keeps the result usable even for a model with more fields than the limit.
+
+    .. note:: This function (pretty safely) assumes that all the records in ``models`` are of the
+              same model type; i.e. they all relate to the same table. This is a pretty safe
+              assumption since there's no way to do multi-table updates in a single query while
+              using sane SQL practices.
+
+    .. note:: This function just returns ``len(models)`` if the backend is anything other than
+              ``SQLITE``, or if ``models`` is empty. The limitation this works around is only
+              applicable to SQLite so on other platforms we can just make the batch size as large
+              as possible.
+
+    :param backend: Backend being used by the application
+    :param models: Sequence of models that need to be batched
+    :returns: Number of models that can be processed in a single batch
+    """
+    if models and backend == constants.DatabaseBackend.SQLITE:
+        variables_per_record = len(models[0].fields) + 1
+        # Floor division keeps the arithmetic in integers (no float round trip); the lower bound
+        # prevents a zero batch size for an extremely wide model.
+        return max(1, constants.SQLITE_VARIABLE_LIMIT // variables_per_record)
+    return len(models)
+
+
 def initialize(config: KodakConfig):
    """Initialize the database interface

--- a/kodak/database/_shared.py
+++ b/kodak/database/_shared.py
@ -2,6 +2,8 @@ import datetime
 import enum
 import typing
 import uuid
+from pathlib import Path
+from typing import Dict
 from typing import NamedTuple
 from typing import Type

@ -48,7 +50,7 @@ class EnumField(peewee.CharField):
            raise peewee.IntegrityError(
                f"Enum {self.enumeration.__name__} has no value '{value}'"
            )
-        return value.name
+        return super().db_value(value.name)

    def python_value(self, value: str) -> enum.Enum:
        """Convert the stored string to the corresponding enum
@ -59,13 +61,25 @@ class EnumField(peewee.CharField):
        :returns: The enum item with the name passed to ``value``
        """
        try:
-            return self.enumeration[value]
+            return self.enumeration[super().python_value(value)]
        except KeyError:
            raise peewee.InterfaceError(
                f"Enum {self.enumeration.__name__} has no value with name '{value}'"
            ) from None


+class PathField(peewee.CharField):
+    """Field for storing filesystem paths in the database as strings
+
+    ``None`` is passed through untouched in both directions so that a missing value is stored as
+    a real ``NULL`` (and rejected by a ``null=False`` column) rather than as the literal string
+    ``"None"``, and so that reading a ``NULL`` does not raise ``TypeError``.
+    """
+
+    def db_value(self, value: typing.Optional[Path]) -> typing.Optional[str]:
+        """Serialize a pathlib object to a database string"""
+        if value is None:
+            return None
+        return super().db_value(str(value))
+
+    def python_value(self, value: typing.Optional[str]) -> typing.Optional[Path]:
+        """Deserialize a database string to a pathlib object"""
+        if value is None:
+            return None
+        return Path(super().python_value(value))
+
+
 class ChecksumField(peewee.CharField):
    """Field for storing checksum hashes in the database

@ -79,11 +93,11 @@ class ChecksumField(peewee.CharField):

    def db_value(self, value: Checksum) -> str:
        """Serialize the checksum to a database string"""
-        return f"{value.algorithm}:{value.digest}"
+        return super().db_value(f"{value.algorithm}:{value.digest}")

    def python_value(self, value: str) -> Checksum:
        """Deserialize a string to a checksum container"""
-        alg, _, digest = value.partition(":")
+        alg, _, digest = super().python_value(value).partition(":")
        return Checksum(algorithm=alg, digest=digest)


@ -95,3 +109,9 @@ class KodakModel(peewee.Model):

    uuid = peewee.UUIDField(null=False, unique=True, default=uuid.uuid4)
    created = peewee.DateTimeField(null=False, default=datetime.datetime.utcnow)
+
+    class _ClassProperty:  # pylint: disable=too-few-public-methods
+        """Minimal read-only descriptor that evaluates a getter against the owning class
+
+        Chaining ``@classmethod`` with ``@property`` was deprecated in Python 3.11 and removed in
+        Python 3.13, so the class-level property is implemented explicitly. The value is
+        available both on the class and on its instances.
+        """
+
+        def __init__(self, getter):
+            self._getter = getter
+
+        def __get__(self, instance, owner=None):
+            return self._getter(owner if owner is not None else type(instance))
+
+    @_ClassProperty
+    def fields(cls) -> Dict[str, peewee.Field]:  # pylint: disable=no-self-argument
+        """Expose the peewee field metadata as a public object"""
+        return cls._meta.fields  # pylint: disable=protected-access
--- a/kodak/database/image.py
+++ b/kodak/database/image.py
@ -1,15 +1,80 @@
+import hashlib
+import os
+from pathlib import Path
+
 import peewee

+from kodak import configuration
 from kodak import constants
+from kodak.database._shared import Checksum
 from kodak.database._shared import ChecksumField
 from kodak.database._shared import EnumField
 from kodak.database._shared import KodakModel
+from kodak.database._shared import PathField


 class ImageRecord(KodakModel):
    """Model for source images"""

-    name = peewee.Charfield(null=False)
-    format = EnumField(constants.ImageFormat, null=False)
+    name = peewee.CharField(null=False)
+    source = PathField(null=False)
+    format_ = EnumField(constants.ImageFormat, null=False)
    deleted = peewee.BooleanField(null=False, default=False)
    checksum = ChecksumField(null=False)
+
+    @classmethod
+    def from_path(cls, config: configuration.KodakConfig, path: Path):
+        """Construct an image record from a path
+
+        The record name is the file's path relative to the configured source directory, with the
+        file extension dropped and the path components joined by
+        ``constants.IMAGE_PATH_NAME_SEPARATOR``. This keeps names distinct for files that share
+        a file name but live in different subdirectories. If the file is not located under the
+        source directory, the bare file name (without extension) is used instead.
+
+        :param config: Populated application configuration object
+        :param path: Full path to the image file to process. The file path provided is expected to
+                     already be absolute, with all symlinks and aliases resolved.
+        :returns: Unsaved image record describing the file
+        :raises RuntimeError: When the file extension does not match any supported image format
+
+        .. note:: This method attempts to _efficiently_ compute a hash of large image files. The
+                  hashing code was adapted from here:
+
+                  https://stackoverflow.com/a/44873382/5361209
+        """
+        hasher = hashlib.sha256()
+        # Reuse a single 1MiB buffer for every read so large files are hashed without copying
+        view = memoryview(bytearray(1024 * 1024))
+        with path.open("rb", buffering=0) as infile:
+            for chunk in iter(lambda: infile.readinto(view), 0):
+                hasher.update(view[:chunk])
+
+        extension = path.suffix.lower()[1:]
+        for item in constants.ImageFormat:
+            if extension in item.value:
+                format_ = item
+                break
+        else:
+            raise RuntimeError(
+                f"Unsupported image file extension '{path.suffix}': {path}"
+            )
+
+        try:
+            relative = path.relative_to(config.source_dir)
+        except ValueError:
+            # Not under the source directory; fall back to the file name alone
+            relative = Path(path.name)
+
+        name = constants.IMAGE_PATH_NAME_SEPARATOR.join(relative.with_suffix("").parts)
+
+        return cls(
+            name=name, source=path, format_=format_, checksum=Checksum.from_hash(hasher)
+        )
+
+    def create_link(self, config: configuration.KodakConfig) -> Path:
+        """Ensure a symbolic link to the source image exists in the content directory
+
+        If something already exists at the link path it is left in place untouched.
+
+        :param config: Populated application configuration object
+        :returns: Path of the symbolic link that points back to the source file
+        """
+        target = Path(config.content_dir) / self.name
+        try:
+            target.symlink_to(self.source)
+        except FileExistsError:
+            # The link path is already occupied; treat the link as present
+            pass
+        return target
+
+    def remove_link(self, config: configuration.KodakConfig) -> None:
+        """Delete this image's link from the content directory, if one is present
+
+        :param config: Populated application configuration object
+        """
+        link = Path(config.content_dir) / self.name
+        link.unlink(missing_ok=True)
--- a/kodak/tools/init.py
+++ b/kodak/tools/init.py
@ -0,0 +1 @@
+from kodak.tools import index
--- a/kodak/tools/index.py
+++ b/kodak/tools/index.py
@ -0,0 +1,138 @@
+import logging
+from pathlib import Path
+from typing import List
+from typing import Optional
+
+from kodak import configuration
+from kodak import constants
+from kodak import database
+
+
+def identify(config: configuration.KodakConfig) -> List[database.ImageRecord]:
+    """Identify source images that will be made available
+
+    The source directory is walked recursively and every file with a supported image extension
+    (matched case-insensitively) is collected. Files whose path is already recorded in the
+    database are skipped; a new, unsaved record is built for each of the others.
+
+    :param config: Populated application configuration object
+    :returns: List of (unsaved) database models representing identified source image files
+    """
+    logger = logging.getLogger(__name__)
+
+    def _identify(path: Path) -> List[Path]:
+        identified = []
+        for item in path.iterdir():
+            # Lowercase the suffix so that files such as ``IMG.JPG`` are indexed as well
+            if (
+                item.is_file()
+                and item.suffix.lower() in constants.IMAGE_FILE_EXTENSIONS
+            ):
+                logger.debug(f"Including file {item}")
+                identified.append(item)
+            elif item.is_dir():
+                logger.debug(f"Entering subdirectory {item}")
+                identified += _identify(item)
+            else:
+                logger.debug(f"Skipping {item}")
+        return identified
+
+    logger.info(
+        f"Identifying image files with extensions {', '.join(constants.IMAGE_FILE_EXTENSIONS)} under {config.source_dir}"
+    )
+
+    images = _identify(config.source_dir)
+
+    logger.info(f"Identified {len(images)} files under {config.source_dir}")
+
+    with database.interface.atomic():
+        # A set gives constant-time membership checks in the loop below
+        existing = {
+            item.source
+            for item in database.ImageRecord.select(database.ImageRecord.source)
+        }
+
+    logger.debug(f"Fetched {len(existing)} existing image records")
+
+    results = []
+    for image in images:
+        if image in existing:
+            logger.debug(f"Skipping existing {image}")
+        else:
+            logger.debug(f"Including newly identified image {image}")
+            results.append(database.ImageRecord.from_path(config, image))
+
+    return results
+
+
+def clean() -> List[database.ImageRecord]:
+    """Identify removed source images and mark their records as deleted
+
+    Every record not already flagged as deleted is checked; when its source file no longer
+    exists the record's ``deleted`` attribute is set. Nothing is written to the database here.
+
+    Complete records are fetched (rather than only the ``source`` column) because the returned
+    models must carry their primary key to be persisted with ``bulk_update`` and their ``name``
+    to have their content link removed.
+
+    :returns: List of (unsaved) database models representing source images that have been deleted
+              or removed
+    """
+
+    logger = logging.getLogger(__name__)
+
+    with database.interface.atomic():
+        # Materialize the lazy query here so that it actually executes inside this block
+        existing = list(
+            database.ImageRecord.select().where(
+                database.ImageRecord.deleted  # pylint: disable=singleton-comparison
+                == False
+            )
+        )
+
+    logger.info(f"Identified {len(existing)} existing image records")
+
+    deleted = []
+    for item in existing:
+        if item.source.exists():
+            logger.debug(
+                f"Image file exists, record will not be modified: {item.source}"
+            )
+        else:
+            logger.debug(f"Image file removed, record will be deleted: {item.source}")
+            item.deleted = True
+            deleted.append(item)
+
+    logger.info(f"Identified {len(deleted)} image records to be marked as deleted")
+
+    return deleted
+
+
+def build(config: Optional[configuration.KodakConfig] = None) -> None:
+    """Build and update the file index
+
+    Performs, in order:
+
+    1. Creates records for newly identified source images.
+    2. Marks records whose source file has disappeared as deleted and removes their links.
+    3. Creates (or, when ``config.expose_source`` is false, removes) the content links for every
+       record that is not deleted.
+
+    :param config: Populated application configuration object. When omitted the configuration
+                   is loaded with ``configuration.load()``.
+    """
+    logger = logging.getLogger(__name__)
+
+    config = config or configuration.load()
+
+    new_images = identify(config)
+    with database.interface.atomic():
+        database.ImageRecord.bulk_create(
+            new_images,
+            batch_size=database.calc_batch_size(config.database.backend, new_images),
+        )
+
+    removed_images = clean()
+    with database.interface.atomic():
+        database.ImageRecord.bulk_update(
+            removed_images,
+            fields=[database.ImageRecord.deleted],
+            batch_size=database.calc_batch_size(
+                config.database.backend, removed_images
+            ),
+        )
+
+    logger.info(f"Removing source links to {len(removed_images)} removed image files")
+
+    for image in removed_images:
+        logger.debug(f"Removing link to removed source image {image.source}")
+        image.remove_link(config)
+
+    logger.info("Processing source links")
+
+    with database.interface.atomic():
+        for image in database.ImageRecord.select().where(
+            database.ImageRecord.deleted
+            == False  # pylint: disable=singleton-comparison
+        ):
+            if config.expose_source:
+                logger.debug(f"Creating source link to {image.source}")
+                image.create_link(config)
+            else:
+                logger.debug(f"Removing source link to {image.source}")
+                # ``ImageRecord`` exposes ``remove_link``; there is no ``delete_link`` method
+                image.remove_link(config)