class ImageFormat(enum.Enum):
    """Supported image conversion formats

    The value of each member is a tuple of the file extensions (lowercase, without the leading
    dot) that are recognized as belonging to that format.
    """

    JPEG = ("jpg", "jpeg")
    PNG = ("png",)


# Number of variables a single SQLite query is assumed to accept; used to size query batches
SQLITE_VARIABLE_LIMIT = 999

DEFAULT_SUPPORTED_FORMATS: Set[ImageFormat] = {ImageFormat.JPEG, ImageFormat.PNG}

# Character substituted for the path separator when an image name is derived from its path
IMAGE_PATH_NAME_SEPARATOR: str = "-"

# Every recognized file extension, including the leading dot (e.g. ``.jpg``). This is built with
# a comprehension rather than module-level ``for`` loops so that the loop variables do not leak
# into the module namespace as public names.
IMAGE_FILE_EXTENSIONS: Set[str] = {
    f".{ext}" for image_format in ImageFormat for ext in image_format.value
}
def calc_batch_size(
    backend: constants.DatabaseBackend, models: Sequence[KodakModel]
) -> int:
    """Determine the batch size that should be used when performing queries

    This is intended to work around the query variable limit in SQLite. Critically this is a
    limit to the number of _variables_, not _records_ that can be referenced in a single query.

    The "correct" way to calculate this is to iterate over the model list and tally the number of
    changed fields, then add one for the table name, and each time you reach the
    ``SQLITE_VARIABLE_LIMIT`` (which is a known constant) cut a new batch until all the models are
    processed. This is very complicated because peewee doesn't provide a simple way to reliably
    identify changed fields.

    The naive way to calculate this (i.e. the way this function does it) is to determine the
    maximum number of variables that _could be_ used to modify a record and use that as the
    constant batch limiter. The theoretical maximum number of variables associated with a single
    record is equal to the number of fields on that record, plus 1 (for the table name). This gives
    the batch size (i.e. number of records that can be modified in a single query) as:

    ::

        max(1, SQLITE_VARIABLE_LIMIT // (len(fields) + 1))

    Where ``fields`` is the collection of fields that could be written on the record. The result
    is floored to a whole number of records and never drops below one, so the returned value is
    always usable as a batch size even for a model with an extreme number of fields.

    .. note:: This function (pretty safely) assumes that all the records in ``models`` are of the
              same model type; i.e. they all relate to the same table. This is a pretty safe
              assumption since there's no way to do multi-table updates in a single query while
              using sane SQL practices.

    .. note:: This function just returns ``len(models)`` if the backend is anything other than
              ``SQLITE``, or if ``models`` is empty. The limitation this works around is only
              applicable to SQLite so on other platforms the batch size can be as large as
              possible.

    :param backend: Backend being used by the application
    :param models: Sequence of models that need to be batched
    :returns: Number of models that can be processed in a single batch
    """
    if not models or backend != constants.DatabaseBackend.SQLITE:
        return len(models)

    # The field metadata is read directly from peewee's ``_meta`` rather than through the
    # ``KodakModel.fields`` accessor: that accessor chains ``@classmethod`` with ``@property``,
    # which was deprecated in Python 3.11 and no longer works as of Python 3.13.
    field_count = len(models[0]._meta.fields)  # pylint: disable=protected-access

    # Integer floor division keeps the arithmetic exact; the lower bound of one guards against
    # returning a batch size of zero.
    return max(1, constants.SQLITE_VARIABLE_LIMIT // (field_count + 1))
class ImageRecord(KodakModel):
    """Model for source images"""

    # Name of the image; also used as the file name of the link in the content directory
    name = peewee.CharField(null=False)
    # Path to the source image file
    source = PathField(null=False)
    # Format of the source image, stored as an ``ImageFormat`` member
    format_ = EnumField(constants.ImageFormat, null=False)
    # Soft-delete marker; records are flagged rather than removed
    deleted = peewee.BooleanField(null=False, default=False)
    # Checksum of the contents of the source image file
    checksum = ChecksumField(null=False)

    @classmethod
    def from_path(cls, config: configuration.KodakConfig, path: Path) -> "ImageRecord":
        """Construct an image record from a path

        The record name is derived from the location of the file relative to the configured
        source directory: the file extension is dropped and the path separators are replaced
        with ``IMAGE_PATH_NAME_SEPARATOR``. A file directly inside the source directory is
        therefore named after its stem, while files in subdirectories keep their parent
        directories in the name so that same-named files in different directories do not
        collide.

        :param config: Populated application configuration object
        :param path: Full path to the image file to process. The file path provided is expected to
                     already be absolute, with all symlinks and aliases resolved.
        :returns: Unsaved record describing the image file
        :raises RuntimeError: If the file extension does not belong to any supported image format

        .. note:: This method attempts to _efficiently_ compute a hash of large image files. The
                  hashing code was adapted from here:

                  https://stackoverflow.com/a/44873382/5361209
        """
        hasher = hashlib.sha256()
        # A single 1 MiB buffer is reused for every read instead of allocating one per chunk
        view = memoryview(bytearray(1024 * 1024))
        with path.open("rb", buffering=0) as infile:
            # ``readinto`` returns the number of bytes read; zero marks the end of the file
            for chunk in iter(lambda: infile.readinto(view), 0):
                hasher.update(view[:chunk])

        extension = path.suffix.lower()[1:]
        for item in constants.ImageFormat:
            if extension in item.value:
                format_ = item
                break
        else:
            raise RuntimeError(
                f"Unsupported image file extension '{path.suffix}': {path}"
            )

        try:
            relative = path.relative_to(config.source_dir)
        except ValueError:
            # The file is not located under the source directory; fall back to the bare file
            # name so that a record can still be constructed
            relative = Path(path.name)

        name = constants.IMAGE_PATH_NAME_SEPARATOR.join(relative.with_suffix("").parts)

        return cls(
            name=name, source=path, format_=format_, checksum=Checksum.from_hash(hasher)
        )

    def create_link(self, config: configuration.KodakConfig) -> Path:
        """Creates a link between the content directory and source directory

        :param config: Populated application configuration object
        :returns: Path to the created symbolic link back to the source file
        """
        link = Path(config.content_dir, self.name)
        try:
            link.symlink_to(self.source)
        except FileExistsError:
            # Deliberately best-effort: whatever already exists at the link path is left
            # untouched and the path is returned as-is
            pass
        return link

    def remove_link(self, config: configuration.KodakConfig) -> None:
        """Remove a link between the content and source directory

        A link that does not exist is silently ignored.

        :param config: Populated application configuration object
        """
        Path(config.content_dir, self.name).unlink(missing_ok=True)
def identify(config: configuration.KodakConfig) -> List[database.ImageRecord]:
    """Identify source images that will be made available

    The source directory is walked recursively and every file with a recognized image file
    extension (compared case-insensitively) is collected. Files whose path is already stored
    on an image record are skipped.

    :param config: Populated application configuration object
    :returns: List of (unsaved) database models representing identified source image files
    """
    logger = logging.getLogger(__name__)

    def _identify(path: Path) -> List[Path]:
        """Recursively collect the image files underneath ``path``"""
        identified = []
        for item in path.iterdir():
            # Extensions are lowercased to match the case-insensitive format detection that
            # is applied when the record is constructed from the path
            is_image = item.suffix.lower() in constants.IMAGE_FILE_EXTENSIONS
            if item.is_file() and is_image:
                logger.debug(f"Including file {item}")
                identified.append(item)
            elif item.is_dir():
                logger.debug(f"Entering subdirectory {item}")
                identified.extend(_identify(item))
            else:
                logger.debug(f"Skipping {item}")
        return identified

    logger.info(
        f"Identifying image files with extensions {', '.join(sorted(constants.IMAGE_FILE_EXTENSIONS))} under {config.source_dir}"
    )

    images = _identify(config.source_dir)

    logger.info(f"Identified {len(images)} files under {config.source_dir}")

    with database.interface.atomic():
        # A set makes the membership test in the loop below constant-time per image.
        # NOTE(review): this includes records already marked as deleted, so a source file that
        # was removed and later restored is skipped here and its record stays deleted.
        existing = {
            item.source
            for item in database.ImageRecord.select(database.ImageRecord.source)
        }

    logger.debug(f"Fetched {len(existing)} existing image records")

    results = []
    for image in images:
        if image in existing:
            logger.debug(f"Skipping existing {image}")
        else:
            logger.debug(f"Including newly identified image {image}")
            results.append(database.ImageRecord.from_path(config, image))

    return results


def clean() -> List[database.ImageRecord]:
    """Identify removed source images and mark them as deleted

    Only records that are not already marked as deleted are examined. The ``deleted`` flag is
    set on the returned models but the change is not written to the database here.

    :returns: List of database models (with unsaved changes) representing source images whose
              file no longer exists
    """
    logger = logging.getLogger(__name__)

    with database.interface.atomic():
        # Complete records are fetched, not just the ``source`` column: the caller needs the
        # primary key to persist the change and the ``name`` to remove the content link
        existing = list(
            database.ImageRecord.select().where(
                database.ImageRecord.deleted  # pylint: disable=singleton-comparison
                == False
            )
        )

    logger.info(f"Identified {len(existing)} existing image records")

    deleted = []
    for item in existing:
        if item.source.exists():
            logger.debug(
                f"Image file exists, record will not be modified: {item.source}"
            )
        else:
            logger.debug(f"Image file removed, record will be deleted: {item.source}")
            item.deleted = True
            deleted.append(item)

    logger.info(f"Identified {len(deleted)} image records to be marked as deleted")

    return deleted


def build(config: Optional[configuration.KodakConfig] = None) -> None:
    """Build and update the file index

    Newly identified source images are saved, records whose source file has disappeared are
    marked as deleted and have their content link removed, and finally the content links of
    all remaining images are created or removed depending on ``config.expose_source``.

    :param config: Populated application configuration object; loaded from the environment
                   when not provided
    """
    logger = logging.getLogger(__name__)

    config = config or configuration.load()

    new_images = identify(config)
    if new_images:
        with database.interface.atomic():
            database.ImageRecord.bulk_create(
                new_images,
                batch_size=database.calc_batch_size(
                    config.database.backend, new_images
                ),
            )

    removed_images = clean()
    if removed_images:
        with database.interface.atomic():
            database.ImageRecord.bulk_update(
                removed_images,
                fields=[database.ImageRecord.deleted],
                batch_size=database.calc_batch_size(
                    config.database.backend, removed_images
                ),
            )

    logger.info(f"Removing source links to {len(removed_images)} removed image files")

    for image in removed_images:
        logger.debug(f"Removing link to removed source image {image.source}")
        image.remove_link(config)

    logger.info("Processing source links")

    with database.interface.atomic():
        for image in database.ImageRecord.select().where(
            database.ImageRecord.deleted
            == False  # pylint: disable=singleton-comparison
        ):
            if config.expose_source:
                logger.debug(f"Creating source link to {image.source}")
                image.create_link(config)
            else:
                logger.debug(f"Removing source link to {image.source}")
                # The record method is ``remove_link``; there is no ``delete_link``
                image.remove_link(config)