Add index generation tooling for detecting source images

This commit is contained in:
Ethan Paul 2021-11-12 23:31:23 -05:00
parent 56e683a94a
commit caa05e6e32
No known key found for this signature in database
GPG Key ID: D0E2CBF1245E92BF
7 changed files with 295 additions and 10 deletions

View File

@ -1,6 +1,7 @@
import flask_restful
from kodak import resources
from kodak import tools
from kodak._server import initialize_database
from kodak._server import KodakFlask
from kodak._server import make_the_tea
@ -12,6 +13,7 @@ API = flask_restful.Api(APPLICATION, catch_all_404s=True)
APPLICATION.before_request(make_the_tea)
APPLICATION.before_first_request(initialize_database)
APPLICATION.before_first_request(tools.index.build)
for resource in resources.RESOURCES:
API.add_resource(resource, *resource.routes)

View File

@ -1,4 +1,7 @@
import enum
from typing import Any
from typing import Dict
from typing import Set
import peewee
@ -34,11 +37,11 @@ class CropAnchor(enum.Enum):
class ImageFormat(enum.Enum):
    """Supported image conversion formats

    The value of each member is the tuple of file extensions (lowercase, without the leading
    dot) that identify files of that format.
    """

    # Each member is defined exactly once; reusing a member name inside an Enum body raises
    # ``TypeError`` when the class is created.
    JPEG = ("jpg", "jpeg")
    PNG = ("png",)
DEFAULT_SQLITE_PRAGMAS = {
DEFAULT_SQLITE_PRAGMAS: Dict[str, Any] = {
"journal_mode": "wal",
"cache_size": -1 * 64000,
"foreign_keys": 1,
@ -46,4 +49,14 @@ DEFAULT_SQLITE_PRAGMAS = {
"synchronous": 0,
}
# Maximum number of variables that may be referenced in a single SQLite query
SQLITE_VARIABLE_LIMIT = 999

# Image formats enabled when no explicit selection is made
DEFAULT_SUPPORTED_FORMATS: Set[ImageFormat] = {ImageFormat.JPEG, ImageFormat.PNG}

# String substituted for the OS path separator when building image names
IMAGE_PATH_NAME_SEPARATOR: str = "-"

# Every known extension of every image format, with the leading dot (e.g. ".jpg").
# Built with a comprehension so no loop variables leak into the module namespace.
IMAGE_FILE_EXTENSIONS: Set[str] = {
    f".{ext}" for item in ImageFormat for ext in item.value
}

View File

@ -1,4 +1,5 @@
import logging
from typing import Sequence
from typing import Tuple
from typing import Type
@ -17,6 +18,51 @@ from kodak.database.image import ImageRecord
MODELS: Tuple[Type[KodakModel], ...] = (ImageRecord, AliasRecord, AccessRecord)
def calc_batch_size(
    backend: constants.DatabaseBackend, models: Sequence[KodakModel]
) -> int:
    """Determine the batch size that should be used when performing queries

    This is intended to work around the query variable limit in SQLite. Critically this is a
    limit to the number of _variables_, not _records_ that can be referenced in a single query.

    The "correct" way to calculate this is to iterate over the model list and tally the number of
    changed fields, then add one for the table name, and each time you reach the
    ``SQLITE_VARIABLE_LIMIT`` (which is a known constant) cut a new batch until all the models are
    processed. This is very complicated because peewee doesn't provide a simple way to reliably
    identify changed fields.

    The naive way to calculate this (i.e. the way this function does it) is to determine the
    maximum number of variables that _could be_ used to modify a record and use that as the
    constant batch limiter. The theoretical maximum number of variables associated with a single
    record is equal to the number of fields on that record, plus 1 (for the table name). This gives
    the batch size (i.e. number of records that can be modified in a single query) as:

    ::

        999 / (len(fields) + 1)

    Where ``fields`` is an array of the fields that could be written on the record. The result
    is rounded down and never less than 1.

    .. note:: This function (pretty safely) assumes that all the records in ``models`` are of the
              same model type; i.e. they all relate to the same table. This is a pretty safe
              assumption since there's no way to do multi-table updates in a single query while
              using sane SQL practices.

    .. note:: This function just returns ``len(models)`` if the backend is anything other than
              ``SQLITE``. This is because the limitation this works around is only applicable to
              SQLite so on other platforms we can just make the batch size as large as possible.
              The same applies when ``models`` is empty (the result is then ``0``).

    :param backend: Backend being used by the application
    :param models: Sequence of models that need to be batched
    :returns: Number of models that can be processed in a single batch
    """
    if not models or backend != constants.DatabaseBackend.SQLITE:
        return len(models)

    # One variable per field plus one for the table name
    variables_per_record = len(models[0].fields) + 1

    # Integer floor division avoids the float round trip; the lower bound of 1 keeps a model
    # with an extreme number of fields from producing a batch size of zero.
    return max(1, constants.SQLITE_VARIABLE_LIMIT // variables_per_record)
def initialize(config: KodakConfig):
"""Initialize the database interface

View File

@ -2,6 +2,8 @@ import datetime
import enum
import typing
import uuid
from pathlib import Path
from typing import Dict
from typing import NamedTuple
from typing import Type
@ -48,7 +50,7 @@ class EnumField(peewee.CharField):
raise peewee.IntegrityError(
f"Enum {self.enumeration.__name__} has no value '{value}'"
)
return value.name
return super().db_value(value.name)
def python_value(self, value: str) -> enum.Enum:
"""Convert the stored string to the corresponding enum
@ -59,13 +61,25 @@ class EnumField(peewee.CharField):
:returns: The enum item with the name passed to ``value``
"""
try:
return self.enumeration[value]
return self.enumeration[super().python_value(value)]
except KeyError:
raise peewee.InterfaceError(
f"Enum {self.enumeration.__name__} has no value with name '{value}'"
) from None
class PathField(peewee.CharField):
    """Field for storing paths in the database

    Paths are stored as their string form and loaded back as :class:`pathlib.Path` objects.
    ``None`` is handed to the parent field untouched in both directions rather than being
    converted to the literal string ``"None"`` or passed to ``Path()``.
    """

    def db_value(self, value: typing.Optional[Path]) -> typing.Optional[str]:
        """Serialize a pathlib object to a database string

        :param value: Path to store, or ``None``
        :returns: Whatever the parent ``CharField`` produces for the path's string form
        """
        if value is None:
            return super().db_value(None)
        return super().db_value(str(value))

    def python_value(self, value: typing.Optional[str]) -> typing.Optional[Path]:
        """Deserialize a database string to a pathlib object

        :param value: Raw value loaded from the database
        :returns: Path built from the stored string, or ``None`` if the parent field yields
                  ``None``
        """
        loaded = super().python_value(value)
        if loaded is None:
            return None
        return Path(loaded)
class ChecksumField(peewee.CharField):
"""Field for storing checksum hashes in the database
@ -79,11 +93,11 @@ class ChecksumField(peewee.CharField):
def db_value(self, value: typing.Optional[Checksum]) -> typing.Optional[str]:
    """Serialize the checksum to a database string

    The stored form is ``<algorithm>:<digest>``. The string is routed through the parent
    field's ``db_value`` instead of being returned directly.

    :param value: Checksum container to store, or ``None``
    :returns: Value produced by the parent field for the serialized checksum
    """
    if value is None:
        return super().db_value(None)
    return super().db_value(f"{value.algorithm}:{value.digest}")
def python_value(self, value: typing.Optional[str]) -> typing.Optional[Checksum]:
    """Deserialize a string to a checksum container

    The stored string is first converted by the parent field, then split on the first ``:``
    into the algorithm name and the digest.

    :param value: Raw value loaded from the database
    :returns: Checksum container, or ``None`` if the parent field yields ``None``
    """
    stored = super().python_value(value)
    if stored is None:
        return None
    alg, _, digest = stored.partition(":")
    return Checksum(algorithm=alg, digest=digest)
@ -95,3 +109,9 @@ class KodakModel(peewee.Model):
uuid = peewee.UUIDField(null=False, unique=True, default=uuid.uuid4)
created = peewee.DateTimeField(null=False, default=datetime.datetime.utcnow)
# NOTE(review): stacking ``@classmethod`` on top of ``@property`` is deprecated since
# Python 3.11 and no longer works as of Python 3.13 -- confirm the supported interpreter
# versions before relying on this accessor.
@classmethod
@property
def fields(cls) -> Dict[str, peewee.Field]:
"""Expose the peewee field metadata as a public object

:returns: The ``fields`` mapping taken from the model's ``_meta`` object
"""
return cls._meta.fields # pylint: disable=protected-access

View File

@ -1,15 +1,80 @@
import hashlib
import os
from pathlib import Path
import peewee
from kodak import configuration
from kodak import constants
from kodak.database._shared import Checksum
from kodak.database._shared import ChecksumField
from kodak.database._shared import EnumField
from kodak.database._shared import KodakModel
from kodak.database._shared import PathField
class ImageRecord(KodakModel):
    """Model for source images"""

    # Name of the image, derived from its location under the source directory
    name = peewee.CharField(null=False)
    # Path to the source image file
    source = PathField(null=False)
    # Image format, identified from the file extension
    format_ = EnumField(constants.ImageFormat, null=False)
    # Whether the source file has been flagged as removed
    deleted = peewee.BooleanField(null=False, default=False)
    # Checksum of the source file contents
    checksum = ChecksumField(null=False)

    @classmethod
    def from_path(
        cls, config: configuration.KodakConfig, path: Path
    ) -> "ImageRecord":
        """Construct an image record from a path

        :param config: Populated application configuration object
        :param path: Full path to the image file to process. The file path provided is expected to
                     already be absolute, with all symlinks and aliases resolved.
        :returns: New (unsaved) record describing the image at ``path``
        :raises RuntimeError: If the file extension does not match any supported image format

        .. note:: This method attempts to _efficiently_ compute a hash of large image files. The
                  hashing code was adapted from here:

                  https://stackoverflow.com/a/44873382/5361209
        """
        # Hash the file in 1MiB chunks through a reusable buffer so that large images are
        # never loaded into memory in one piece
        hasher = hashlib.sha256()
        view = memoryview(bytearray(1024 * 1024))
        with path.open("rb", buffering=0) as infile:
            for chunk in iter(lambda: infile.readinto(view), 0):
                hasher.update(view[:chunk])

        # Extensions are matched case-insensitively and without the leading dot
        extension = path.suffix.lower()[1:]
        for item in constants.ImageFormat:
            if extension in item.value:
                format_ = item
                break
        else:
            raise RuntimeError(
                f"Unsupported image file extension '{path.suffix}': {path}"
            )

        # Build the name from the location of the file relative to the source directory so
        # that images with the same file name in different subdirectories get distinct
        # names. Only the bare file name is used when the path is not under the source
        # directory.
        try:
            relative = path.relative_to(config.source_dir)
        except ValueError:
            relative = Path(path.name)
        name = constants.IMAGE_PATH_NAME_SEPARATOR.join(relative.with_suffix("").parts)

        return cls(
            name=name, source=path, format_=format_, checksum=Checksum.from_hash(hasher)
        )

    def create_link(self, config: configuration.KodakConfig) -> Path:
        """Creates a link between the content directory and source directory

        :param config: Populated application configuration object
        :returns: Path to the created symbolic link back to the source file
        """
        link = Path(config.content_dir, self.name)
        try:
            link.symlink_to(self.source)
        except FileExistsError:
            # Something already exists at the link path; it is left in place as-is
            pass
        return link

    def remove_link(self, config: configuration.KodakConfig) -> None:
        """Remove a link between the content and source directory

        A link that does not exist is silently ignored.

        :param config: Populated application configuration object
        """
        Path(config.content_dir, self.name).unlink(missing_ok=True)

1
kodak/tools/__init__.py Normal file
View File

@ -0,0 +1 @@
from kodak.tools import index

138
kodak/tools/index.py Normal file
View File

@ -0,0 +1,138 @@
import logging
from pathlib import Path
from typing import List
from typing import Optional
from kodak import configuration
from kodak import constants
from kodak import database
def identify(config: configuration.KodakConfig) -> List[database.ImageRecord]:
    """Identify source images that will be made available

    The source directory is walked recursively and every file with a known image extension
    (matched case-insensitively) is collected. Files whose path is already stored on an
    existing image record are skipped.

    :param config: Populated application configuration object
    :returns: List of (unsaved) database models representing identified source image files
    """
    logger = logging.getLogger(__name__)

    def _identify(path: Path) -> List[Path]:
        """Recursively collect the image files below ``path``"""
        identified = []
        for item in path.iterdir():
            # Lowercase the suffix so files like ``photo.JPG`` are not skipped
            if (
                item.is_file()
                and item.suffix.lower() in constants.IMAGE_FILE_EXTENSIONS
            ):
                logger.debug(f"Including file {item}")
                identified.append(item)
            elif item.is_dir():
                logger.debug(f"Entering subdirectory {item}")
                identified += _identify(item)
            else:
                logger.debug(f"Skipping {item}")
        return identified

    # Sorted so the logged list is stable between runs (set ordering is arbitrary)
    extensions = ", ".join(sorted(constants.IMAGE_FILE_EXTENSIONS))
    logger.info(
        f"Identifying image files with extensions {extensions} under {config.source_dir}"
    )
    images = _identify(config.source_dir)
    logger.info(f"Identified {len(images)} files under {config.source_dir}")

    with database.interface.atomic():
        # A set keeps the membership test below constant-time per image
        existing = {
            item.source
            for item in database.ImageRecord.select(database.ImageRecord.source)
        }
    logger.debug(f"Fetched {len(existing)} existing image records")

    results = []
    for image in images:
        if image in existing:
            logger.debug(f"Skipping existing {image}")
        else:
            logger.debug(f"Including newly identified image {image}")
            results.append(database.ImageRecord.from_path(config, image))

    return results
def clean() -> List[database.ImageRecord]:
    """Identify removed or changed source images and mark them as deleted

    Every record not already flagged as deleted is checked; records whose source file no
    longer exists get ``deleted`` set to ``True`` on the in-memory model. Nothing is written
    to the database here.

    :returns: List of (unsaved) database models representing source images that have been deleted
              or removed
    """
    logger = logging.getLogger(__name__)

    with database.interface.atomic():
        # Load complete rows rather than only ``source``: ``build`` bulk-updates the returned
        # models and calls ``remove_link`` on them, which needs the primary key and ``name``
        # to be populated. The query is materialized here so it runs inside the transaction.
        existing = list(
            database.ImageRecord.select().where(
                database.ImageRecord.deleted  # pylint: disable=singleton-comparison
                == False
            )
        )
    logger.info(f"Identified {len(existing)} existing image records")

    deleted = []
    for item in existing:
        if item.source.exists():
            logger.debug(
                f"Image file exists, record will not be modified: {item.source}"
            )
        else:
            logger.debug(f"Image file removed, record will be deleted: {item.source}")
            item.deleted = True
            deleted.append(item)

    logger.info(f"Identified {len(deleted)} image records to be marked as deleted")
    return deleted
def build(config: Optional[configuration.KodakConfig] = None) -> None:
    """Build and update the file index

    Steps, in order:

    1. Create records for newly identified source images
    2. Flag records whose source file is gone as deleted and remove their links
    3. For every remaining record, create the source link when ``config.expose_source`` is
       set, otherwise remove it

    :param config: Populated application configuration object. Loaded via
                   ``configuration.load()`` when omitted.
    """
    logger = logging.getLogger(__name__)
    config = config or configuration.load()

    new_images = identify(config)
    with database.interface.atomic():
        database.ImageRecord.bulk_create(
            new_images,
            batch_size=database.calc_batch_size(config.database.backend, new_images),
        )

    removed_images = clean()
    with database.interface.atomic():
        database.ImageRecord.bulk_update(
            removed_images,
            fields=[database.ImageRecord.deleted],
            batch_size=database.calc_batch_size(
                config.database.backend, removed_images
            ),
        )

    logger.info(f"Removing source links to {len(removed_images)} removed image files")
    for image in removed_images:
        logger.debug(f"Removing link to removed source image {image.source}")
        image.remove_link(config)

    logger.info("Processing source links")
    with database.interface.atomic():
        for image in database.ImageRecord.select().where(
            database.ImageRecord.deleted
            == False  # pylint: disable=singleton-comparison
        ):
            if config.expose_source:
                logger.debug(f"Creating source link to {image.source}")
                image.create_link(config)
            else:
                logger.debug(f"Removing source link to {image.source}")
                # ``ImageRecord`` defines ``remove_link``; there is no ``delete_link``
                image.remove_link(config)