mirror of
https://github.com/enpaul/kodak.git
synced 2024-11-23 15:07:13 +00:00
Add index generation tooling for detecting source images
This commit is contained in:
parent
56e683a94a
commit
caa05e6e32
@ -1,6 +1,7 @@
|
|||||||
import flask_restful
|
import flask_restful
|
||||||
|
|
||||||
from kodak import resources
|
from kodak import resources
|
||||||
|
from kodak import tools
|
||||||
from kodak._server import initialize_database
|
from kodak._server import initialize_database
|
||||||
from kodak._server import KodakFlask
|
from kodak._server import KodakFlask
|
||||||
from kodak._server import make_the_tea
|
from kodak._server import make_the_tea
|
||||||
@ -12,6 +13,7 @@ API = flask_restful.Api(APPLICATION, catch_all_404s=True)
|
|||||||
|
|
||||||
APPLICATION.before_request(make_the_tea)
|
APPLICATION.before_request(make_the_tea)
|
||||||
APPLICATION.before_first_request(initialize_database)
|
APPLICATION.before_first_request(initialize_database)
|
||||||
|
APPLICATION.before_first_request(tools.index.build)
|
||||||
|
|
||||||
for resource in resources.RESOURCES:
|
for resource in resources.RESOURCES:
|
||||||
API.add_resource(resource, *resource.routes)
|
API.add_resource(resource, *resource.routes)
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
import enum
|
import enum
|
||||||
|
from typing import Any
|
||||||
|
from typing import Dict
|
||||||
|
from typing import Set
|
||||||
|
|
||||||
import peewee
|
import peewee
|
||||||
|
|
||||||
@ -34,11 +37,11 @@ class CropAnchor(enum.Enum):
|
|||||||
class ImageFormat(enum.Enum):
|
class ImageFormat(enum.Enum):
|
||||||
"""Supported image conversion formats"""
|
"""Supported image conversion formats"""
|
||||||
|
|
||||||
JPEG = enum.auto()
|
JPEG = ("jpg", "jpeg")
|
||||||
PNG = enum.auto()
|
PNG = ("png",)
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_SQLITE_PRAGMAS = {
|
DEFAULT_SQLITE_PRAGMAS: Dict[str, Any] = {
|
||||||
"journal_mode": "wal",
|
"journal_mode": "wal",
|
||||||
"cache_size": -1 * 64000,
|
"cache_size": -1 * 64000,
|
||||||
"foreign_keys": 1,
|
"foreign_keys": 1,
|
||||||
@ -46,4 +49,14 @@ DEFAULT_SQLITE_PRAGMAS = {
|
|||||||
"synchronous": 0,
|
"synchronous": 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_SUPPORTED_FORMATS = {ImageFormat.JPEG, ImageFormat.PNG}
|
SQLITE_VARIABLE_LIMIT = 999
|
||||||
|
|
||||||
|
DEFAULT_SUPPORTED_FORMATS: Set[ImageFormat] = {ImageFormat.JPEG, ImageFormat.PNG}
|
||||||
|
|
||||||
|
IMAGE_PATH_NAME_SEPARATOR: str = "-"
|
||||||
|
|
||||||
|
# Every file extension (including the leading dot) that maps to a supported image format.
# Built with a comprehension so that no loop variables are left behind as stray public names
# in this module's namespace.
IMAGE_FILE_EXTENSIONS: Set[str] = {
    f".{ext}" for item in ImageFormat for ext in item.value
}
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import Sequence
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
from typing import Type
|
from typing import Type
|
||||||
|
|
||||||
@ -17,6 +18,51 @@ from kodak.database.image import ImageRecord
|
|||||||
MODELS: Tuple[Type[KodakModel], ...] = (ImageRecord, AliasRecord, AccessRecord)
|
MODELS: Tuple[Type[KodakModel], ...] = (ImageRecord, AliasRecord, AccessRecord)
|
||||||
|
|
||||||
|
|
||||||
|
def calc_batch_size(
    backend: constants.DatabaseBackend, models: Sequence[KodakModel]
) -> int:
    """Work out how many records may be written in one batched query

    SQLite caps the number of _variables_ (not _records_) that a single query may reference, so
    bulk operations against it have to be split into batches.

    An exact calculation would walk the model list, count the fields that actually changed on
    each record (plus one for the table name), and start a new batch every time the running total
    hits ``SQLITE_VARIABLE_LIMIT``. peewee offers no simple, reliable way to find the changed
    fields, so this function takes the pessimistic shortcut instead: it assumes every field of
    every record is written. The worst case for one record is then its field count plus one (for
    the table name), which yields a constant batch size of:

    ::

        999 / (len(fields) + 1)

    Where ``fields`` is the collection of fields that could be written on the record.

    .. note:: Only the first entry of ``models`` is inspected; all records are assumed to be of
              the same model type (i.e. to belong to the same table). That is a reasonable
              assumption because a single sane SQL query does not update several tables at once.

    .. note:: For any backend other than ``SQLITE``, and for an empty ``models`` sequence, the
              function simply returns ``len(models)`` since the variable limit being worked
              around only applies to SQLite.

    :param backend: Backend being used by the application
    :param models: Sequence of models that need to be batched
    :returns: Number of models that can be processed in a single batch
    """
    if backend != constants.DatabaseBackend.SQLITE or not models:
        return len(models)

    variables_per_record = len(models[0].fields) + 1
    return constants.SQLITE_VARIABLE_LIMIT // variables_per_record
|
||||||
|
|
||||||
|
|
||||||
def initialize(config: KodakConfig):
|
def initialize(config: KodakConfig):
|
||||||
"""Initialize the database interface
|
"""Initialize the database interface
|
||||||
|
|
||||||
|
@ -2,6 +2,8 @@ import datetime
|
|||||||
import enum
|
import enum
|
||||||
import typing
|
import typing
|
||||||
import uuid
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict
|
||||||
from typing import NamedTuple
|
from typing import NamedTuple
|
||||||
from typing import Type
|
from typing import Type
|
||||||
|
|
||||||
@ -48,7 +50,7 @@ class EnumField(peewee.CharField):
|
|||||||
raise peewee.IntegrityError(
|
raise peewee.IntegrityError(
|
||||||
f"Enum {self.enumeration.__name__} has no value '{value}'"
|
f"Enum {self.enumeration.__name__} has no value '{value}'"
|
||||||
)
|
)
|
||||||
return value.name
|
return super().db_value(value.name)
|
||||||
|
|
||||||
def python_value(self, value: str) -> enum.Enum:
|
def python_value(self, value: str) -> enum.Enum:
|
||||||
"""Convert the stored string to the corresponding enum
|
"""Convert the stored string to the corresponding enum
|
||||||
@ -59,13 +61,25 @@ class EnumField(peewee.CharField):
|
|||||||
:returns: The enum item with the name passed to ``value``
|
:returns: The enum item with the name passed to ``value``
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return self.enumeration[value]
|
return self.enumeration[super().python_value(value)]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise peewee.InterfaceError(
|
raise peewee.InterfaceError(
|
||||||
f"Enum {self.enumeration.__name__} has no value with name '{value}'"
|
f"Enum {self.enumeration.__name__} has no value with name '{value}'"
|
||||||
) from None
|
) from None
|
||||||
|
|
||||||
|
|
||||||
|
class PathField(peewee.CharField):
    """Field for storing filesystem paths in the database

    Values are written as the string form of the path and read back as ``pathlib.Path``
    objects. ``None`` is passed through untouched in both directions so that it is never
    stored as the literal text ``"None"`` (which would slip past a ``NOT NULL`` constraint)
    and so that reading a ``NULL`` column does not raise.
    """

    def db_value(self, value: typing.Optional[Path]) -> typing.Optional[str]:
        """Serialize a pathlib object to a database string

        :param value: Path to store, or ``None``
        :returns: Whatever the parent ``CharField`` produces for the path's string form
        """
        return super().db_value(None if value is None else str(value))

    def python_value(self, value: typing.Optional[str]) -> typing.Optional[Path]:
        """Deserialize a database string to a pathlib object

        :param value: Raw value read from the database
        :returns: ``Path`` built from the stored string, or ``None`` when no value is stored
        """
        stored = super().python_value(value)
        return None if stored is None else Path(stored)
|
||||||
|
|
||||||
|
|
||||||
class ChecksumField(peewee.CharField):
|
class ChecksumField(peewee.CharField):
|
||||||
"""Field for storing checksum hashes in the database
|
"""Field for storing checksum hashes in the database
|
||||||
|
|
||||||
@ -79,11 +93,11 @@ class ChecksumField(peewee.CharField):
|
|||||||
|
|
||||||
def db_value(self, value: Checksum) -> str:
|
def db_value(self, value: Checksum) -> str:
|
||||||
"""Serialize the checkstum to a database string"""
|
"""Serialize the checkstum to a database string"""
|
||||||
return f"{value.algorithm}:{value.digest}"
|
return super().db_value(f"{value.algorithm}:{value.digest}")
|
||||||
|
|
||||||
def python_value(self, value: str) -> Checksum:
|
def python_value(self, value: str) -> Checksum:
|
||||||
"""Deserailize a string to a checksum container"""
|
"""Deserailize a string to a checksum container"""
|
||||||
alg, _, digest = value.partition(":")
|
alg, _, digest = super().python_value(value).partition(":")
|
||||||
return Checksum(algorithm=alg, digest=digest)
|
return Checksum(algorithm=alg, digest=digest)
|
||||||
|
|
||||||
|
|
||||||
@ -95,3 +109,9 @@ class KodakModel(peewee.Model):
|
|||||||
|
|
||||||
uuid = peewee.UUIDField(null=False, unique=True, default=uuid.uuid4)
|
uuid = peewee.UUIDField(null=False, unique=True, default=uuid.uuid4)
|
||||||
created = peewee.DateTimeField(null=False, default=datetime.datetime.utcnow)
|
created = peewee.DateTimeField(null=False, default=datetime.datetime.utcnow)
|
||||||
|
|
||||||
|
class _ClassProperty:  # pylint: disable=too-few-public-methods
    """Read-only descriptor that computes its value from the owning class

    Stands in for stacking ``@classmethod`` on top of ``@property``, which only works on a
    narrow range of Python versions (deprecated in 3.11, removed in 3.13). Lookups through
    either the class or an instance both hand the class to the wrapped function.
    """

    def __init__(self, getter):
        self._getter = getter
        self.__doc__ = getter.__doc__

    def __get__(self, instance, owner=None):
        # ``owner`` may be omitted when the descriptor protocol is invoked manually
        return self._getter(owner if owner is not None else type(instance))

@_ClassProperty
def fields(cls) -> Dict[str, peewee.Field]:
    """Expose the peewee field metadata as a public object"""
    return cls._meta.fields  # pylint: disable=protected-access
|
||||||
|
@ -1,15 +1,80 @@
|
|||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import peewee
|
import peewee
|
||||||
|
|
||||||
|
from kodak import configuration
|
||||||
from kodak import constants
|
from kodak import constants
|
||||||
|
from kodak.database._shared import Checksum
|
||||||
from kodak.database._shared import ChecksumField
|
from kodak.database._shared import ChecksumField
|
||||||
from kodak.database._shared import EnumField
|
from kodak.database._shared import EnumField
|
||||||
from kodak.database._shared import KodakModel
|
from kodak.database._shared import KodakModel
|
||||||
|
from kodak.database._shared import PathField
|
||||||
|
|
||||||
|
|
||||||
class ImageRecord(KodakModel):
|
class ImageRecord(KodakModel):
|
||||||
"""Model for source images"""
|
"""Model for source images"""
|
||||||
|
|
||||||
name = peewee.Charfield(null=False)
|
name = peewee.CharField(null=False)
|
||||||
format = EnumField(constants.ImageFormat, null=False)
|
source = PathField(null=False)
|
||||||
|
format_ = EnumField(constants.ImageFormat, null=False)
|
||||||
deleted = peewee.BooleanField(null=False, default=False)
|
deleted = peewee.BooleanField(null=False, default=False)
|
||||||
checksum = ChecksumField(null=False)
|
checksum = ChecksumField(null=False)
|
||||||
|
|
||||||
|
@classmethod
def from_path(cls, config: configuration.KodakConfig, path: Path):
    """Construct an image record from a path

    :param config: Populated application configuration object
    :param path: Full path to the image file to process. The file path provided is expected to
                 already be absolute, with all symlinks and aliases resolved.
    :returns: New (unsaved) record populated with the name, source path, format, and SHA-256
              checksum of the file
    :raises RuntimeError: If the file extension does not belong to any ``ImageFormat`` member

    .. note:: This method attempts to _efficiently_ compute a hash of large image files. The
              hashing code was adapted from here:

              https://stackoverflow.com/a/44873382/5361209
    """
    hasher = hashlib.sha256()
    # A single 1 MiB buffer is reused for every read so that large files are hashed without
    # allocating a new bytes object per chunk
    view = memoryview(bytearray(1024 * 1024))
    with path.open("rb", buffering=0) as infile:
        # ``readinto`` reports how many bytes it wrote into the buffer; the ``0`` sentinel
        # (end of file) stops the iteration
        for chunk in iter(lambda: infile.readinto(view), 0):
            hasher.update(view[:chunk])

    name = path.stem
    # Normalize the extension: lower case, without the leading dot
    extension = path.suffix.lower()[1:]

    for item in constants.ImageFormat:
        if extension in item.value:
            format_ = item
            break
    else:
        raise RuntimeError(
            f"Unsupported image file extension '{path.suffix}' for source file {path}"
        )

    # NOTE(review): ``name`` is only the file stem at this point, so it contains no directory
    # separators and will not normally contain the source directory either; these replacements
    # look like they were meant for a full path. Confirm the intended naming scheme before
    # changing this, since the name is also used to build the content link.
    name = name.replace(str(config.source_dir), "").replace(
        os.sep, constants.IMAGE_PATH_NAME_SEPARATOR
    )

    return cls(
        name=name, source=path, format_=format_, checksum=Checksum.from_hash(hasher)
    )
|
||||||
|
|
||||||
|
def create_link(self, config: configuration.KodakConfig) -> Path:
    """Link this image's source file into the content directory

    The link is a symbolic link named after the record, placed directly under the configured
    content directory, and pointing at the record's source path. If something already exists
    at the link location it is left as it is.

    :param config: Populated application configuration object
    :returns: Path to the created symbolic link back to the source file
    """
    destination = Path(config.content_dir) / self.name

    try:
        destination.symlink_to(self.source)
    except FileExistsError:
        # Best effort: an entry already present at this location is not an error
        pass

    return destination
|
||||||
|
|
||||||
|
def remove_link(self, config: configuration.KodakConfig) -> None:
    """Delete this image's link from the content directory

    Nothing happens (and no error is raised) when the link is not present.

    :param config: Populated application configuration object
    """
    destination = Path(config.content_dir) / self.name
    destination.unlink(missing_ok=True)
|
||||||
|
1
kodak/tools/__init__.py
Normal file
1
kodak/tools/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from kodak.tools import index
|
138
kodak/tools/index.py
Normal file
138
kodak/tools/index.py
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from kodak import configuration
|
||||||
|
from kodak import constants
|
||||||
|
from kodak import database
|
||||||
|
|
||||||
|
|
||||||
|
def identify(config: configuration.KodakConfig) -> List[database.ImageRecord]:
    """Identify source images that will be made available

    Recursively walks the configured source directory, collects every file whose extension
    matches a supported image format (compared case-insensitively), and builds a record for
    each file that does not already have one in the database.

    :param config: Populated application configuration object
    :returns: List of (unsaved) database models representing identified source image files
    """
    logger = logging.getLogger(__name__)

    def _identify(path: Path) -> List[Path]:
        """Collect supported image files under ``path``, descending into subdirectories"""
        identified = []
        for item in path.iterdir():
            # Extensions are stored lower case in the constant, so normalize before comparing;
            # otherwise files such as ``photo.JPG`` would be skipped
            if item.is_file() and item.suffix.lower() in constants.IMAGE_FILE_EXTENSIONS:
                logger.debug(f"Including file {item}")
                identified.append(item)
            elif item.is_dir():
                logger.debug(f"Entering subdirectory {item}")
                identified += _identify(item)
            else:
                logger.debug(f"Skipping {item}")
        return identified

    logger.info(
        f"Identifying image files with extensions {', '.join(constants.IMAGE_FILE_EXTENSIONS)} under {config.source_dir}"
    )

    images = _identify(config.source_dir)

    logger.info(f"Identified {len(images)} files under {config.source_dir}")

    with database.interface.atomic():
        # A set keeps the per-image membership test below constant time
        existing = {
            item.source
            for item in database.ImageRecord.select(database.ImageRecord.source)
        }

    logger.debug(f"Fetched {len(existing)} existing image records")

    results = []
    for image in images:
        if image in existing:
            logger.debug(f"Skipping existing {image}")
        else:
            logger.debug(f"Including newly identified image {image}")
            results.append(database.ImageRecord.from_path(config, image))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def clean() -> List[database.ImageRecord]:
    """Identify removed source images and flag their records as deleted

    Every record not already flagged as deleted is checked against the filesystem; records
    whose source file no longer exists get ``deleted`` set to ``True`` in memory. Nothing is
    written to the database here, persisting the change is left to the caller.

    :returns: List of (unsaved) database models representing source images that have been deleted
              or removed
    """
    logger = logging.getLogger(__name__)

    with database.interface.atomic():
        # Load complete records rather than only the ``source`` column: the caller needs the
        # primary key to persist the change with a bulk update and the ``name`` to remove the
        # content link
        existing = list(
            database.ImageRecord.select().where(
                database.ImageRecord.deleted  # pylint: disable=singleton-comparison
                == False
            )
        )

    logger.info(f"Identified {len(existing)} existing image records")

    deleted = []
    for item in existing:
        if item.source.exists():
            logger.debug(
                f"Image file exists, record will not be modified: {item.source}"
            )
        else:
            logger.debug(f"Image file removed, record will be deleted: {item.source}")
            item.deleted = True
            deleted.append(item)

    logger.info(f"Identified {len(deleted)} image records to be marked as deleted")

    return deleted
|
||||||
|
|
||||||
|
|
||||||
|
def build(config: Optional[configuration.KodakConfig] = None) -> None:
    """Build and update the file index

    Runs the full indexing pass:

    1. Create records for newly identified source images
    2. Flag records whose source file has disappeared as deleted and remove their links
    3. Create (or remove, when source exposure is disabled) the content link of every record
       that is not flagged as deleted

    :param config: Populated application configuration object. When omitted the configuration
                   is loaded with ``configuration.load()``.
    """
    logger = logging.getLogger(__name__)

    config = config or configuration.load()

    new_images = identify(config)
    if new_images:
        with database.interface.atomic():
            database.ImageRecord.bulk_create(
                new_images,
                batch_size=database.calc_batch_size(config.database.backend, new_images),
            )

    removed_images = clean()
    if removed_images:
        with database.interface.atomic():
            database.ImageRecord.bulk_update(
                removed_images,
                fields=[database.ImageRecord.deleted],
                batch_size=database.calc_batch_size(
                    config.database.backend, removed_images
                ),
            )

    logger.info(f"Removing source links to {len(removed_images)} removed image files")

    for image in removed_images:
        logger.debug(f"Removing link to removed source image {image.source}")
        image.remove_link(config)

    logger.info("Processing source links")

    with database.interface.atomic():
        for image in database.ImageRecord.select().where(
            database.ImageRecord.deleted
            == False  # pylint: disable=singleton-comparison
        ):
            if config.expose_source:
                logger.debug(f"Creating source link to {image.source}")
                image.create_link(config)
            else:
                logger.debug(f"Removing source link to {image.source}")
                # The record method is named ``remove_link``
                image.remove_link(config)
|
Loading…
Reference in New Issue
Block a user