mirror of
https://github.com/enpaul/kodak.git
synced 2024-11-23 15:07:13 +00:00
Add index generation tooling for detecting source images
This commit is contained in:
parent
56e683a94a
commit
caa05e6e32
@ -1,6 +1,7 @@
|
||||
import flask_restful
|
||||
|
||||
from kodak import resources
|
||||
from kodak import tools
|
||||
from kodak._server import initialize_database
|
||||
from kodak._server import KodakFlask
|
||||
from kodak._server import make_the_tea
|
||||
@ -12,6 +13,7 @@ API = flask_restful.Api(APPLICATION, catch_all_404s=True)
|
||||
|
||||
APPLICATION.before_request(make_the_tea)
|
||||
APPLICATION.before_first_request(initialize_database)
|
||||
APPLICATION.before_first_request(tools.index.build)
|
||||
|
||||
for resource in resources.RESOURCES:
|
||||
API.add_resource(resource, *resource.routes)
|
||||
|
@ -1,4 +1,7 @@
|
||||
import enum
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import Set
|
||||
|
||||
import peewee
|
||||
|
||||
@ -34,11 +37,11 @@ class CropAnchor(enum.Enum):
|
||||
class ImageFormat(enum.Enum):
    """Supported image conversion formats

    Each member's value is the tuple of file extensions (lowercase, without the
    leading dot) that are recognized as belonging to that format.
    """

    JPEG = ("jpg", "jpeg")
    PNG = ("png",)
|
||||
|
||||
|
||||
DEFAULT_SQLITE_PRAGMAS = {
|
||||
DEFAULT_SQLITE_PRAGMAS: Dict[str, Any] = {
|
||||
"journal_mode": "wal",
|
||||
"cache_size": -1 * 64000,
|
||||
"foreign_keys": 1,
|
||||
@ -46,4 +49,14 @@ DEFAULT_SQLITE_PRAGMAS = {
|
||||
"synchronous": 0,
|
||||
}
|
||||
|
||||
DEFAULT_SUPPORTED_FORMATS: Set[ImageFormat] = {ImageFormat.JPEG, ImageFormat.PNG}

# Maximum number of variables SQLite allows in a single query; used to compute
# safe batch sizes for bulk create/update operations
SQLITE_VARIABLE_LIMIT = 999

# Separator used when flattening a source file path into a flat image name
IMAGE_PATH_NAME_SEPARATOR: str = "-"

# File extensions (including the leading dot) recognized as source images,
# derived from the extension tuples on the ImageFormat members. Built with a
# comprehension so no loop variables leak into the module namespace.
IMAGE_FILE_EXTENSIONS: Set[str] = {
    f".{extension}" for image_format in ImageFormat for extension in image_format.value
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from typing import Sequence
|
||||
from typing import Tuple
|
||||
from typing import Type
|
||||
|
||||
@ -17,6 +18,51 @@ from kodak.database.image import ImageRecord
|
||||
MODELS: Tuple[Type[KodakModel], ...] = (ImageRecord, AliasRecord, AccessRecord)
|
||||
|
||||
|
||||
def calc_batch_size(
    backend: constants.DatabaseBackend, models: Sequence[KodakModel]
) -> int:
    """Determine the batch size that should be used when performing queries

    This is intended to work around the query variable limit in SQLite. Critically this is a
    limit to the number of _variables_, not _records_ that can be referenced in a single query.

    The "correct" way to calculate this is to iterate over the model list and tally the number of
    changed fields, then add one for the table name, and each time you reach the
    ``SQLITE_VARIABLE_LIMIT`` (which is a known constant) cut a new batch until all the models are
    processed. This is very complicated because peewee doesn't provide a simple way to reliably
    identify changed fields.

    The naive way to calculate this (i.e. the way this function does it) is to determine the
    maximum number of variables that _could be_ used to modify a record and use that as the
    constant batch limiter. The theoretical maximum number of variables associated with a single
    record is equal to the number of fields on that record, plus 1 (for the table name). This gives
    the batch size (i.e. number of records that can be modified in a single query) as:

    ::

      999 / (len(fields) + 1)

    Where ``fields`` is an array of the fields that could be written on the record.

    .. note:: This function (pretty safely) assumes that all the records in ``models`` are of the
              same model type; i.e. they all relate to the same table. This is a pretty safe
              assumption since there's no way to do multi-table updates in a single query while
              using sane SQL practices.

    .. note:: This function just returns ``len(models)`` if the backend is anything other than
              ``SQLITE``. This is because the limitation this works around is only applicable to
              SQLite so on other platforms we can just make the batch size as large as possible.

    :param backend: Backend being used by the application
    :param models: Sequence of models that need to be batched
    :returns: Number of models that can be processed in a single batch
    """
    if models and backend == constants.DatabaseBackend.SQLITE:
        # Floor division avoids the float round-trip of int(a / b) and cannot
        # lose precision for large values
        return constants.SQLITE_VARIABLE_LIMIT // (len(models[0].fields) + 1)
    return len(models)
|
||||
|
||||
|
||||
def initialize(config: KodakConfig):
|
||||
"""Initialize the database interface
|
||||
|
||||
|
@ -2,6 +2,8 @@ import datetime
|
||||
import enum
|
||||
import typing
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
from typing import NamedTuple
|
||||
from typing import Type
|
||||
|
||||
@ -48,7 +50,7 @@ class EnumField(peewee.CharField):
|
||||
raise peewee.IntegrityError(
|
||||
f"Enum {self.enumeration.__name__} has no value '{value}'"
|
||||
)
|
||||
return value.name
|
||||
return super().db_value(value.name)
|
||||
|
||||
def python_value(self, value: str) -> enum.Enum:
|
||||
"""Convert the stored string to the corresponding enum
|
||||
@ -59,13 +61,25 @@ class EnumField(peewee.CharField):
|
||||
:returns: The enum item with the name passed to ``value``
|
||||
"""
|
||||
try:
|
||||
return self.enumeration[value]
|
||||
return self.enumeration[super().python_value(value)]
|
||||
except KeyError:
|
||||
raise peewee.InterfaceError(
|
||||
f"Enum {self.enumeration.__name__} has no value with name '{value}'"
|
||||
) from None
|
||||
|
||||
|
||||
class PathField(peewee.CharField):
    """Character field that transparently (de)serializes ``pathlib.Path`` objects"""

    def db_value(self, value: Path) -> str:
        """Store the path as its plain string representation"""
        as_text = str(value)
        return super().db_value(as_text)

    def python_value(self, value: str) -> Path:
        """Rebuild a pathlib object from the stored string"""
        raw = super().python_value(value)
        return Path(raw)
|
||||
|
||||
|
||||
class ChecksumField(peewee.CharField):
|
||||
"""Field for storing checksum hashes in the database
|
||||
|
||||
@ -79,11 +93,11 @@ class ChecksumField(peewee.CharField):
|
||||
|
||||
def db_value(self, value: Checksum) -> str:
    """Serialize the checksum container to its ``algorithm:digest`` string form"""
    serialized = f"{value.algorithm}:{value.digest}"
    return super().db_value(serialized)
|
||||
|
||||
def python_value(self, value: str) -> Checksum:
    """Deserialize an ``algorithm:digest`` string into a checksum container"""
    stored = super().python_value(value)
    # partition() splits on the first colon only, so digests containing colons
    # would survive intact
    algorithm, _, digest = stored.partition(":")
    return Checksum(algorithm=algorithm, digest=digest)
|
||||
|
||||
|
||||
@ -95,3 +109,9 @@ class KodakModel(peewee.Model):
|
||||
|
||||
uuid = peewee.UUIDField(null=False, unique=True, default=uuid.uuid4)
|
||||
created = peewee.DateTimeField(null=False, default=datetime.datetime.utcnow)
|
||||
|
||||
@classmethod
@property
def fields(cls) -> Dict[str, peewee.Field]:
    """Expose the peewee field metadata as a public object

    Returns the mapping of field name to ``peewee.Field`` that peewee stores on
    the model's internal metadata, so callers don't need to touch ``_meta``.

    .. note:: NOTE(review): stacking ``@classmethod`` on ``@property`` works on
              Python 3.9/3.10 but is deprecated in 3.11 and removed in 3.13 —
              revisit this if the supported interpreter range changes.
    """
    return cls._meta.fields  # pylint: disable=protected-access
|
||||
|
@ -1,15 +1,80 @@
|
||||
import hashlib
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import peewee
|
||||
|
||||
from kodak import configuration
|
||||
from kodak import constants
|
||||
from kodak.database._shared import Checksum
|
||||
from kodak.database._shared import ChecksumField
|
||||
from kodak.database._shared import EnumField
|
||||
from kodak.database._shared import KodakModel
|
||||
from kodak.database._shared import PathField
|
||||
|
||||
|
||||
class ImageRecord(KodakModel):
    """Model for source images

    Tracks a source image file on disk: its flattened public name, absolute
    source path, detected format, deletion flag, and content checksum.
    """

    name = peewee.CharField(null=False)
    source = PathField(null=False)
    format_ = EnumField(constants.ImageFormat, null=False)
    deleted = peewee.BooleanField(null=False, default=False)
    checksum = ChecksumField(null=False)

    @classmethod
    def from_path(cls, config: configuration.KodakConfig, path: Path):
        """Construct an image record from a path

        :param config: Populated application configuration object
        :param path: Full path to the image file to process. The file path provided is expected to
                     already be absolute, with all symlinks and aliases resolved.
        :raises RuntimeError: If the file's extension does not match any supported image format
        :returns: Unsaved record representing the image file

        .. note:: This method attempts to _efficiently_ compute a hash of large image files. The
                  hashing code was adapted from here:

                  https://stackoverflow.com/a/44873382/5361209
        """
        # Hash the file in 1MiB chunks through a reusable buffer so large images
        # never need to be read fully into memory
        hasher = hashlib.sha256()
        view = memoryview(bytearray(1024 * 1024))
        with path.open("rb", buffering=0) as infile:
            for chunk in iter(lambda: infile.readinto(view), 0):
                hasher.update(view[:chunk])

        name = path.stem
        extension = path.suffix

        # Match the (dotless, lowercased) extension against the extension tuples
        # declared on the ImageFormat members
        for item in constants.ImageFormat:
            if extension.lower()[1:] in item.value:
                format_ = item
                break
        else:
            # Include the offending path in the error so the failure is
            # debuggable instead of a bare RuntimeError
            raise RuntimeError(
                f"Unsupported image file extension '{extension}' for file {path}"
            )

        # NOTE(review): ``name`` is ``path.stem`` at this point, which never
        # contains the source directory or a path separator, so both of these
        # replacements look like no-ops — presumably this was meant to operate
        # on the path relative to ``config.source_dir``; confirm the intended
        # naming scheme before changing it
        name = name.replace(str(config.source_dir), "").replace(
            os.sep, constants.IMAGE_PATH_NAME_SEPARATOR
        )

        return cls(
            name=name, source=path, format_=format_, checksum=Checksum.from_hash(hasher)
        )

    def create_link(self, config: configuration.KodakConfig) -> Path:
        """Creates a link between the content directory and source directory

        :param config: Populated application configuration object
        :returns: Path to the created symbolic link back to the source file
        """
        link = Path(config.content_dir, self.name)
        try:
            link.symlink_to(self.source)
        except FileExistsError:
            # Link already present from a previous run; keep it rather than fail
            pass
        return link

    def remove_link(self, config: configuration.KodakConfig) -> None:
        """Remove a link between the content and source directory

        :param config: Populated application configuration object
        """
        # missing_ok makes this idempotent when the link was never created
        Path(config.content_dir, self.name).unlink(missing_ok=True)
||||
|
1
kodak/tools/__init__.py
Normal file
1
kodak/tools/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from kodak.tools import index
|
138
kodak/tools/index.py
Normal file
138
kodak/tools/index.py
Normal file
@ -0,0 +1,138 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
|
||||
from kodak import configuration
|
||||
from kodak import constants
|
||||
from kodak import database
|
||||
|
||||
|
||||
def identify(config: configuration.KodakConfig) -> List[database.ImageRecord]:
    """Identify source images that will be made available

    :param config: Populated application configuration object
    :returns: List of (unsaved) database models representing identified source image files
    """

    def _identify(path: Path) -> List[Path]:
        """Recursively collect files under ``path`` with a recognized image extension"""
        identified = []
        for item in path.iterdir():
            if item.is_file() and item.suffix in constants.IMAGE_FILE_EXTENSIONS:
                logger.debug(f"Including file {item}")
                identified.append(item)
            elif item.is_dir():
                logger.debug(f"Entering subdirectory {item}")
                identified += _identify(item)
            else:
                logger.debug(f"Skipping {item}")
        return identified

    logger = logging.getLogger(__name__)

    logger.info(
        f"Identifying image files with extensions {', '.join(constants.IMAGE_FILE_EXTENSIONS)} under {config.source_dir}"
    )

    images = _identify(config.source_dir)

    logger.info(f"Identified {len(images)} files under {config.source_dir}")

    with database.interface.atomic():
        # Build a set rather than a list: membership is tested once per
        # identified file below, and a list would make that loop
        # O(len(images) * len(existing))
        existing = {
            item.source
            for item in database.ImageRecord.select(database.ImageRecord.source)
        }

    logger.debug(f"Fetched {len(existing)} existing image records")

    results = []
    for image in images:
        if image in existing:
            logger.debug(f"Skipping existing {image}")
        else:
            logger.debug(f"Including newly identified image {image}")
            results.append(database.ImageRecord.from_path(config, image))

    return results
|
||||
|
||||
|
||||
def clean() -> List[database.ImageRecord]:
    """Identify removed or changed source images and mark them as deleted

    :returns: List of (unsaved) database models representing source images that have been deleted
              or removed
    """

    logger = logging.getLogger(__name__)

    with database.interface.atomic():
        # Fetch full records rather than only the ``source`` column: the models
        # returned here are passed to ``bulk_update`` (which needs the primary
        # key loaded) and later to ``remove_link`` (which reads ``name``), so
        # partially-loaded rows would silently misbehave
        existing = database.ImageRecord.select().where(
            database.ImageRecord.deleted  # pylint: disable=singleton-comparison
            == False
        )

    logger.info(f"Identified {len(existing)} existing image records")

    deleted = []
    for item in existing:
        if item.source.exists():
            logger.debug(
                f"Image file exists, record will not be modified: {item.source}"
            )
        else:
            logger.debug(f"Image file removed, record will be deleted: {item.source}")
            item.deleted = True
            deleted.append(item)

    logger.info(f"Identified {len(deleted)} image records to be marked as deleted")

    return deleted
|
||||
|
||||
|
||||
def build(config: Optional[configuration.KodakConfig] = None) -> None:
    """Build and update the file index

    Creates records for newly identified source images, marks records whose
    files have disappeared as deleted, and reconciles the symlinks in the
    content directory with the ``expose_source`` setting.

    :param config: Populated application configuration object; loaded from the
                   environment when not provided
    """
    logger = logging.getLogger(__name__)

    config = config or configuration.load()

    new_images = identify(config)
    with database.interface.atomic():
        database.ImageRecord.bulk_create(
            new_images,
            batch_size=database.calc_batch_size(config.database.backend, new_images),
        )

    removed_images = clean()
    with database.interface.atomic():
        database.ImageRecord.bulk_update(
            removed_images,
            fields=[database.ImageRecord.deleted],
            batch_size=database.calc_batch_size(
                config.database.backend, removed_images
            ),
        )

    logger.info(f"Removing source links to {len(removed_images)} removed image files")

    for image in removed_images:
        logger.debug(f"Removing link to removed source image {image.source}")
        image.remove_link(config)

    logger.info("Processing source links")

    with database.interface.atomic():
        for image in database.ImageRecord.select().where(
            database.ImageRecord.deleted
            == False  # pylint: disable=singleton-comparison
        ):
            if config.expose_source:
                logger.debug(f"Creating source link to {image.source}")
                image.create_link(config)
            else:
                logger.debug(f"Removing source link to {image.source}")
                # Bugfix: ImageRecord defines ``remove_link`` — the previous
                # call to ``delete_link`` raised AttributeError at runtime
                image.remove_link(config)
|
Loading…
Reference in New Issue
Block a user