diff --git a/elodie.py b/elodie.py index e516e2aa..066eff57 100755 --- a/elodie.py +++ b/elodie.py @@ -36,7 +36,7 @@ FILESYSTEM = FileSystem() -def import_file(_file, destination, album_from_folder, trash, allow_duplicates, location=None, time=None): +def import_file(_file, destination, album_from_folder, trash, allow_duplicates, location=None, time=None, media=None): _file = _decode(_file) destination = _decode(destination) @@ -54,8 +54,8 @@ def import_file(_file, destination, album_from_folder, trash, allow_duplicates, _file, destination)) return - - media = Media.get_class_by_file(_file, get_all_subclasses()) + if media is None: + media = Media.get_class_by_file(_file, get_all_subclasses()) if not media: log.warn('Not a supported file (%s)' % _file) log.all('{"source":"%s", "error_msg":"Not a supported file"}' % _file) @@ -148,24 +148,57 @@ def _import(destination, source, file, album_from_folder, trash, allow_duplicate exclude_regex_list = set(exclude_regex) - for path in paths: - path = os.path.expanduser(path) - if os.path.isdir(path): - files.update(FILESYSTEM.get_all_files(path, None, exclude_regex_list)) - else: - if not FILESYSTEM.should_exclude(path, exclude_regex_list, True): - files.add(path) - - for current_file in files: - dest_path = import_file(current_file, destination, album_from_folder, - trash, allow_duplicates, location, time) - if dest_path: - result.append((current_file, True)) - elif not allow_duplicates: - result.append((current_file, None)) # duplicate - else: - result.append((current_file, False)) # error - has_errors = has_errors is True or not dest_path + try: + for path in paths: + path = os.path.expanduser(path) + if os.path.isdir(path): + files.update(FILESYSTEM.get_all_files(path, None, exclude_regex_list)) + else: + if not FILESYSTEM.should_exclude(path, exclude_regex_list, True): + files.add(path) + + media_by_file = {} + exif_media = [] + for current_file in files: + media = Media.get_class_by_file(current_file, get_all_subclasses()) + if media: + media_by_file[current_file] = media + if isinstance(media, Media): + exif_media.append(media) + + if exif_media: + exif_tags = set() + for media in exif_media: + exif_tags.update(media.get_exiftool_batch_tags()) + + exif_metadata = ExifTool().get_tags_batch( + sorted(exif_tags), + [media.get_file_path() for media in exif_media] + ) + exif_by_file = { + metadata['SourceFile']: metadata + for metadata in exif_metadata + if 'SourceFile' in metadata + } + for media in exif_media: + media.set_exiftool_attributes( + exif_by_file.get(media.get_file_path()) + ) + + for current_file in files: + dest_path = import_file(current_file, destination, album_from_folder, + trash, allow_duplicates, location, time, + media_by_file.get(current_file)) + if dest_path: + result.append((current_file, True)) + elif not allow_duplicates: + result.append((current_file, None)) # duplicate + else: + result.append((current_file, False)) # error + has_errors = has_errors is True or not dest_path + finally: + FILESYSTEM.flush() + geolocation.flush_db() result.write() @@ -198,7 +231,7 @@ def _generate_db(source, debug): db.add_hash(db.checksum(current_file), current_file) log.progress() - db.update_hash_db() + db.flush() log.progress('', True) result.write() @@ -291,92 +324,96 @@ def _update(album, location, time, title, paths, debug, dry_run): else: files.add(path) - for current_file in files: - if not os.path.exists(current_file): - has_errors = True - result.append((current_file, False)) - log.warn('Could not find %s' % current_file) - log.all('{"source":"%s", "error_msg":"Could not find %s"}' % - (current_file, current_file)) - continue - - current_file = os.path.expanduser(current_file) - - # The destination folder structure could contain any number of levels - # So we calculate that and traverse up the tree. - # '/path/to/file/photo.jpg' -> '/path/to/file' -> - # ['path','to','file'] -> ['path','to'] -> '/path/to' - current_directory = os.path.dirname(current_file) - destination_depth = -1 * len(FILESYSTEM.get_folder_path_definition()) - destination = os.sep.join( - os.path.normpath( - current_directory - ).split(os.sep)[:destination_depth] - ) - - media = Media.get_class_by_file(current_file, get_all_subclasses()) - if not media: - continue - - updated = False - if location: - update_location(media, current_file, location) - updated = True - if time: - update_time(media, current_file, time) - updated = True - if album: - media.set_album(album) - updated = True - - # Updating a title can be problematic when doing it 2+ times on a file. - # You would end up with img_001.jpg -> img_001-first-title.jpg -> - # img_001-first-title-second-title.jpg. - # To resolve that we have to track the prior title (if there was one. - # Then we massage the updated_media's metadata['base_name'] to remove - # the old title. - # Since FileSystem.get_file_name() relies on base_name it will properly - # rename the file by updating the title instead of appending it. - remove_old_title_from_name = False - if title: - # We call get_metadata() to cache it before making any changes - metadata = media.get_metadata() - title_update_status = media.set_title(title) - original_title = metadata['title'] - if title_update_status and original_title: - # @TODO: We should move this to a shared method since - # FileSystem.get_file_name() does it too. - original_title = re.sub(r'\W+', '-', original_title.lower()) - original_base_name = metadata['base_name'] - remove_old_title_from_name = True - updated = True - - if updated: - updated_media = Media.get_class_by_file(current_file, - get_all_subclasses()) - # See comments above on why we have to do this when titles - # get updated. - if remove_old_title_from_name and len(original_title) > 0: - updated_media.get_metadata() - updated_media.set_metadata_basename( - original_base_name.replace('-%s' % original_title, '')) - - dest_path = FILESYSTEM.process_file(current_file, destination, - updated_media, move=True, allowDuplicate=True) - log.info(u'%s -> %s' % (current_file, dest_path)) - log.all('{"source":"%s", "destination":"%s"}' % (current_file, - dest_path)) - # If the folder we moved the file out of or its parent are empty - # we delete it. - FILESYSTEM.delete_directory_if_empty(os.path.dirname(current_file)) - FILESYSTEM.delete_directory_if_empty( - os.path.dirname(os.path.dirname(current_file))) - result.append((current_file, bool(dest_path))) - # Trip has_errors to False if it's already False or dest_path is. - has_errors = has_errors is True or not dest_path - else: - has_errors = False - result.append((current_file, False)) + try: + for current_file in files: + if not os.path.exists(current_file): + has_errors = True + result.append((current_file, False)) + log.warn('Could not find %s' % current_file) + log.all('{"source":"%s", "error_msg":"Could not find %s"}' % + (current_file, current_file)) + continue + + current_file = os.path.expanduser(current_file) + + # The destination folder structure could contain any number of levels + # So we calculate that and traverse up the tree. + # '/path/to/file/photo.jpg' -> '/path/to/file' -> + # ['path','to','file'] -> ['path','to'] -> '/path/to' + current_directory = os.path.dirname(current_file) + destination_depth = -1 * len(FILESYSTEM.get_folder_path_definition()) + destination = os.sep.join( + os.path.normpath( + current_directory + ).split(os.sep)[:destination_depth] + ) + + media = Media.get_class_by_file(current_file, get_all_subclasses()) + if not media: + continue + + updated = False + if location: + update_location(media, current_file, location) + updated = True + if time: + update_time(media, current_file, time) + updated = True + if album: + media.set_album(album) + updated = True + + # Updating a title can be problematic when doing it 2+ times on a file. + # You would end up with img_001.jpg -> img_001-first-title.jpg -> + # img_001-first-title-second-title.jpg. + # To resolve that we have to track the prior title (if there was one. + # Then we massage the updated_media's metadata['base_name'] to remove + # the old title. + # Since FileSystem.get_file_name() relies on base_name it will properly + # rename the file by updating the title instead of appending it. + remove_old_title_from_name = False + if title: + # We call get_metadata() to cache it before making any changes + metadata = media.get_metadata() + title_update_status = media.set_title(title) + original_title = metadata['title'] + if title_update_status and original_title: + # @TODO: We should move this to a shared method since + # FileSystem.get_file_name() does it too. + original_title = re.sub(r'\W+', '-', original_title.lower()) + original_base_name = metadata['base_name'] + remove_old_title_from_name = True + updated = True + + if updated: + updated_media = Media.get_class_by_file(current_file, + get_all_subclasses()) + # See comments above on why we have to do this when titles + # get updated. + if remove_old_title_from_name and len(original_title) > 0: + updated_media.get_metadata() + updated_media.set_metadata_basename( + original_base_name.replace('-%s' % original_title, '')) + + dest_path = FILESYSTEM.process_file(current_file, destination, + updated_media, move=True, allowDuplicate=True) + log.info(u'%s -> %s' % (current_file, dest_path)) + log.all('{"source":"%s", "destination":"%s"}' % (current_file, + dest_path)) + # If the folder we moved the file out of or its parent are empty + # we delete it. + FILESYSTEM.delete_directory_if_empty(os.path.dirname(current_file)) + FILESYSTEM.delete_directory_if_empty( + os.path.dirname(os.path.dirname(current_file))) + result.append((current_file, bool(dest_path))) + # Trip has_errors to False if it's already False or dest_path is. + has_errors = has_errors is True or not dest_path + else: + has_errors = False + result.append((current_file, False)) + finally: + FILESYSTEM.flush() + geolocation.flush_db() result.write() diff --git a/elodie/filesystem.py b/elodie/filesystem.py index 7f514101..554ee367 100644 --- a/elodie/filesystem.py +++ b/elodie/filesystem.py @@ -48,6 +48,15 @@ def __init__(self): # Instantiate a plugins object self.plugins = Plugins() + self.db = Db() + self.db_key = (constants.hash_db(), constants.location_db()) + + def _get_db(self): + db_key = (constants.hash_db(), constants.location_db()) + if self.db is None or self.db_key != db_key: + self.db = Db() + self.db_key = db_key + return self.db def _file_operation(self, operation_type, src, dst=None): """Perform file operation with dry-run support.""" @@ -507,7 +516,7 @@ def parse_mask_for_location(self, mask, location_parts, place_name): return folder_name def process_checksum(self, _file, allow_duplicate): - db = Db() + db = self._get_db() checksum = db.checksum(_file) if(checksum is None): log.info('Could not get checksum for %s.' % _file) @@ -620,9 +629,7 @@ def process_file(self, _file, destination, media, **kwargs): print(f"[DRY-RUN] Would set utime for: {_file}") print(f"[DRY-RUN] Would set utime from metadata for: {dest_path}") - db = Db() - db.add_hash(checksum, dest_path) - db.update_hash_db() + self._get_db().add_hash(checksum, dest_path) # Run `after()` for every loaded plugin and if any of them raise an exception # then we skip importing the file and log a message. @@ -634,6 +641,9 @@ def process_file(self, _file, destination, media, **kwargs): return dest_path + def flush(self): + self._get_db().flush() + def set_utime_from_metadata(self, metadata, file_path): """ Set the modification time on the file based on the file name. """ diff --git a/elodie/geolocation.py b/elodie/geolocation.py index 66112184..f248ddf1 100644 --- a/elodie/geolocation.py +++ b/elodie/geolocation.py @@ -19,17 +19,43 @@ __DEFAULT_LOCATION__ = 'Unknown Location' __PREFER_ENGLISH_NAMES__ = None __EXIFTOOL_AVAILABLE__ = None +__DB__ = None +__DB_KEY__ = None +__COORDINATE_CACHE__ = {} +__PLACE_NAME_CACHE__ = {} + + +def _get_db(): + global __DB__ + global __DB_KEY__ + db_key = (constants.hash_db(), constants.location_db()) + if __DB__ is None or __DB_KEY__ != db_key: + __DB__ = Db() + __DB_KEY__ = db_key + __COORDINATE_CACHE__.clear() + __PLACE_NAME_CACHE__.clear() + return __DB__ + + +def flush_db(): + db = _get_db() + db.flush() def coordinates_by_name(name): + if name in __COORDINATE_CACHE__: + return __COORDINATE_CACHE__[name] + # Try to get cached location first - db = Db() + db = _get_db() cached_coordinates = db.get_location_coordinates(name) if(cached_coordinates is not None): - return { + result = { 'latitude': cached_coordinates[0], 'longitude': cached_coordinates[1] } + __COORDINATE_CACHE__[name] = result + return result # Use MapQuest if key is available, otherwise use ExifTool key = get_key() @@ -62,14 +88,17 @@ def coordinates_by_name(name): use_location = location['latLng'] break - return { + result = { 'latitude': use_location['lat'], 'longitude': use_location['lng'] } + __COORDINATE_CACHE__[name] = result + return result else: # Use ExifTool as alternative when MapQuest key is not configured exiftool_result = exiftool_coordinates_by_name(name) if exiftool_result is not None: + __COORDINATE_CACHE__[name] = exiftool_result return exiftool_result return None @@ -228,13 +257,18 @@ def place_name(lat, lon): if(not isinstance(lon, float)): lon = float(lon) + cache_key = (round(lat, 4), round(lon, 4)) + if cache_key in __PLACE_NAME_CACHE__: + return __PLACE_NAME_CACHE__[cache_key] + # Try to get cached location first - db = Db() + db = _get_db() # 3km distace radious for a match cached_place_name = db.get_location_name(lat, lon, 3000) # We check that it's a dict to coerce an upgrade of the location # db from a string location to a dictionary. See gh-160. if(isinstance(cached_place_name, dict)): + __PLACE_NAME_CACHE__[cache_key] = cached_place_name return cached_place_name lookup_place_name = {} @@ -263,12 +297,11 @@ def place_name(lat, lon): if(lookup_place_name): db.add_location(lat, lon, lookup_place_name) - # TODO: Maybe this should only be done on exit and not for every write. - db.update_location_db() if('default' not in lookup_place_name): lookup_place_name = lookup_place_name_default + __PLACE_NAME_CACHE__[cache_key] = lookup_place_name return lookup_place_name diff --git a/elodie/localstorage.py b/elodie/localstorage.py index 5966c9a1..a1f7620a 100644 --- a/elodie/localstorage.py +++ b/elodie/localstorage.py @@ -9,7 +9,7 @@ import os import sys -from math import radians, cos, sqrt +from math import ceil, radians, cos, sqrt from shutil import copyfile from time import strftime @@ -58,6 +58,62 @@ def __init__(self): except ValueError: pass + self.hash_db_dirty = False + self.location_db_dirty = False + self.location_grid_size = 0.01 + self._location_name_index = {} + self._location_grid_index = {} + self._location_distance_cache = {} + self._rebuild_indexes() + + def _rebuild_indexes(self): + self._location_name_index = {} + self._location_grid_index = {} + self._location_distance_cache = {} + for data in self.location_db: + if isinstance(data['name'], str): + self._location_name_index[data['name']] = (data['lat'], data['long']) + cell = self._location_grid_key(data['lat'], data['long']) + if cell not in self._location_grid_index: + self._location_grid_index[cell] = [] + self._location_grid_index[cell].append(data) + + def _location_grid_key(self, latitude, longitude): + return ( + int(float(latitude) / self.location_grid_size), + int(float(longitude) / self.location_grid_size), + ) + + def _distance_m(self, latitude, longitude, location): + lon1, lat1, lon2, lat2 = list(map( + radians, + [longitude, latitude, location['long'], location['lat']] + )) + + r = 6371000 # radius of the earth in m + x = (lon2 - lon1) * cos(0.5 * (lat2 + lat1)) + y = lat2 - lat1 + return r * sqrt(x * x + y * y) + + def _location_candidates(self, latitude, longitude, threshold_m): + lat = float(latitude) + lon = float(longitude) + lat_radius = threshold_m / 111320.0 + lon_scale = max(abs(cos(radians(lat))), 0.1) + lon_radius = threshold_m / (111320.0 * lon_scale) + radius = max( + 1, + int(ceil(max(lat_radius, lon_radius) / self.location_grid_size)) + ) + center_lat, center_lon = self._location_grid_key(lat, lon) + candidates = [] + for lat_offset in range(-radius, radius + 1): + for lon_offset in range(-radius, radius + 1): + cell = (center_lat + lat_offset, center_lon + lon_offset) + if cell in self._location_grid_index: + candidates.extend(self._location_grid_index[cell]) + return candidates + def add_hash(self, key, value, write=False): """Add a hash to the hash db. @@ -66,6 +122,7 @@ def add_hash(self, key, value, write=False): :param bool write: If true, write the hash db to disk. """ self.hash_db[key] = value + self.hash_db_dirty = True if(write is True): self.update_hash_db() @@ -90,6 +147,14 @@ def add_location(self, latitude, longitude, place, write=False): data['long'] = longitude data['name'] = place self.location_db.append(data) + self.location_db_dirty = True + if isinstance(data['name'], str): + self._location_name_index[data['name']] = (data['lat'], data['long']) + cell = self._location_grid_key(data['lat'], data['long']) + if cell not in self._location_grid_index: + self._location_grid_index[cell] = [] + self._location_grid_index[cell].append(data) + self._location_distance_cache = {} if(write is True): self.update_location_db() @@ -148,27 +213,25 @@ def get_location_name(self, latitude, longitude, threshold_m): the given latitude and longitude. :returns: str, or None if a matching location couldn't be found. """ + cache_key = ( + round(float(latitude), 4), + round(float(longitude), 4), + int(threshold_m), + ) + if cache_key in self._location_distance_cache: + return self._location_distance_cache[cache_key] + last_d = sys.maxsize name = None - for data in self.location_db: - # As threshold is quite small use simple math - # From http://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points # noqa - # convert decimal degrees to radians - - lon1, lat1, lon2, lat2 = list(map( - radians, - [longitude, latitude, data['long'], data['lat']] - )) - - r = 6371000 # radius of the earth in m - x = (lon2 - lon1) * cos(0.5 * (lat2 + lat1)) - y = lat2 - lat1 - d = r * sqrt(x * x + y * y) + candidates = self._location_candidates(latitude, longitude, threshold_m) + for data in candidates: + d = self._distance_m(latitude, longitude, data) # Use if closer then threshold_km reuse lookup if(d <= threshold_m and d < last_d): name = data['name'] - last_d = d + last_d = d + self._location_distance_cache[cache_key] = name return name def get_location_coordinates(self, name): @@ -177,11 +240,7 @@ def get_location_coordinates(self, name): :param str name: Name of the location. :returns: tuple(float), or None if the location wasn't in the database. """ - for data in self.location_db: - if data['name'] == name: - return (data['lat'], data['long']) - - return None + return self._location_name_index.get(name) def all(self): """Generator to get all entries from self.hash_db @@ -193,6 +252,7 @@ def all(self): def reset_hash_db(self): self.hash_db = {} + self.hash_db_dirty = True def update_hash_db(self): """Write the hash db to disk.""" @@ -201,6 +261,7 @@ def update_hash_db(self): return with open(constants.hash_db(), 'w') as f: json.dump(self.hash_db, f) + self.hash_db_dirty = False def update_location_db(self): """Write the location db to disk.""" @@ -209,3 +270,10 @@ def update_location_db(self): return with open(constants.location_db(), 'w') as f: json.dump(self.location_db, f) + self.location_db_dirty = False + + def flush(self): + if self.hash_db_dirty: + self.update_hash_db() + if self.location_db_dirty: + self.update_location_db() diff --git a/elodie/media/media.py b/elodie/media/media.py index 788b670d..496c8601 100644 --- a/elodie/media/media.py +++ b/elodie/media/media.py @@ -132,6 +132,24 @@ def get_exiftool_attributes(self): return self.exif_metadata + def set_exiftool_attributes(self, attributes): + self.exif_metadata = attributes + + def get_exiftool_batch_tags(self): + tags = set(self.exif_map['date_taken']) + tags.update(self.camera_make_keys) + tags.update(self.camera_model_keys) + tags.update(self.album_keys) + tags.update(self.latitude_keys) + tags.update(self.longitude_keys) + tags.update([ + self.title_key, + self.latitude_ref_key, + self.longitude_ref_key, + self.original_name_key, + ]) + return list(tags) + def get_camera_make(self): """Get the camera make stored in EXIF. diff --git a/elodie/tests/elodie_test.py b/elodie/tests/elodie_test.py index 69d53973..daf1401f 100644 --- a/elodie/tests/elodie_test.py +++ b/elodie/tests/elodie_test.py @@ -293,6 +293,45 @@ def test_import_invalid_file_exit_code(): assert result.exit_code == 1, result.exit_code +def test_cli_import_persists_hash_db_on_success(): + temporary_folder, folder = helper.create_working_folder() + temporary_folder_destination, folder_destination = helper.create_working_folder() + + origin_valid = '%s/valid.jpg' % folder + shutil.copyfile(helper.get_file('plain.jpg'), origin_valid) + + runner = CliRunner() + result = runner.invoke(elodie._import, ['--destination', folder_destination, '--allow-duplicates', origin_valid]) + + db = Db() + + shutil.rmtree(folder) + shutil.rmtree(folder_destination) + + assert result.exit_code == 0, result.output + assert len(db.hash_db) == 1, db.hash_db + +def test_cli_import_flushes_successes_even_when_command_exits_with_error(): + temporary_folder, folder = helper.create_working_folder() + temporary_folder_destination, folder_destination = helper.create_working_folder() + + origin_invalid = '%s/invalid.jpg' % folder + shutil.copyfile(helper.get_file('invalid.jpg'), origin_invalid) + + origin_valid = '%s/valid.jpg' % folder + shutil.copyfile(helper.get_file('plain.jpg'), origin_valid) + + runner = CliRunner() + result = runner.invoke(elodie._import, ['--destination', folder_destination, '--allow-duplicates', origin_invalid, origin_valid]) + + db = Db() + + shutil.rmtree(folder) + shutil.rmtree(folder_destination) + + assert result.exit_code == 1, result.output + assert len(db.hash_db) == 1, db.hash_db + def test_import_file_with_single_exclude(): temporary_folder, folder = helper.create_working_folder() temporary_folder_destination, folder_destination = helper.create_working_folder() diff --git a/elodie/tests/filesystem_test.py b/elodie/tests/filesystem_test.py index c2f87ff9..8a852cca 100644 --- a/elodie/tests/filesystem_test.py +++ b/elodie/tests/filesystem_test.py @@ -15,6 +15,7 @@ from . import helper from elodie.config import load_config from elodie.filesystem import FileSystem +from elodie.localstorage import Db from elodie.media.text import Text from elodie.media.media import Media from elodie.media.photo import Photo @@ -832,6 +833,24 @@ def test_process_file_with_location_and_title(): assert origin_checksum_preprocess == origin_checksum assert helper.path_tz_fix(os.path.join('2015-12-Dec','Sunnyvale','2015-12-05_00-59-26-photo-some-title.jpg')) in destination, destination +def test_process_file_defers_hash_db_write(): + filesystem = FileSystem() + filesystem.db = mock.Mock(spec=Db) + filesystem.process_checksum = mock.Mock(return_value='checksum') + + temporary_folder, folder = helper.create_working_folder() + origin = os.path.join(folder, 'photo.jpg') + shutil.copyfile(helper.get_file('plain.jpg'), origin) + + media = Photo(origin) + destination = filesystem.process_file(origin, temporary_folder, media, allowDuplicate=True) + + filesystem.db.add_hash.assert_called_once_with('checksum', destination) + filesystem.db.update_hash_db.assert_not_called() + + shutil.rmtree(folder) + shutil.rmtree(os.path.dirname(os.path.dirname(destination))) + def test_process_file_with_album(): filesystem = FileSystem() temporary_folder, folder = helper.create_working_folder() diff --git a/elodie/tests/localstorage_test.py b/elodie/tests/localstorage_test.py index 2e584f03..c92bf2ad 100644 --- a/elodie/tests/localstorage_test.py +++ b/elodie/tests/localstorage_test.py @@ -172,6 +172,26 @@ def test_update_hash_db(): db3 = Db() assert db3.check_hash(random_key) == True +def test_flush_persists_pending_hash_and_location_updates(): + db = Db() + + random_key = helper.random_string(10) + random_value = helper.random_string(12) + latitude, longitude, name = helper.get_test_location() + + db.add_hash(random_key, random_value) + db.add_location(latitude, longitude, name) + + db2 = Db() + assert db2.check_hash(random_key) == False + assert db2.get_location_coordinates(name) is None + + db.flush() + + db3 = Db() + assert db3.check_hash(random_key) == True + assert db3.get_location_coordinates(name) == (latitude, longitude) + def test_checksum(): db = Db() @@ -232,6 +252,20 @@ def test_get_location_name_outside_threshold(): assert retrieved_name is None +def test_get_location_name_across_grid_boundary(): + db = Db() + db.location_grid_size = 0.05 + db._rebuild_indexes() + + latitude = 37.0499 + longitude = -122.0499 + name = 'Boundary Test' + db.add_location(latitude, longitude, name) + + retrieved_name = db.get_location_name(37.0501, -122.0501, 100) + + assert retrieved_name == name + def test_get_location_coordinates_exists(): db = Db() diff --git a/scripts/benchmark_db_cache.py b/scripts/benchmark_db_cache.py new file mode 100644 index 00000000..22704e68 --- /dev/null +++ b/scripts/benchmark_db_cache.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +import argparse +import json +import os +import sys +import tempfile +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from elodie.localstorage import Db + + +def benchmark_hash_writes(entries): + tmpdir = tempfile.mkdtemp(prefix='elodie-bench-current-') + os.environ['ELODIE_APPLICATION_DIRECTORY'] = tmpdir + start = time.perf_counter() + for i in range(entries): + db = Db() + db.add_hash(f'k{i}', f'v{i}') + db.update_hash_db() + current = time.perf_counter() - start + + tmpdir = tempfile.mkdtemp(prefix='elodie-bench-batched-') + os.environ['ELODIE_APPLICATION_DIRECTORY'] = tmpdir + start = time.perf_counter() + db = Db() + for i in range(entries): + db.add_hash(f'k{i}', f'v{i}') + db.flush() + batched = time.perf_counter() - start + + return { + 'benchmark': 'hash_writes', + 'entries': entries, + 'current_seconds': round(current, 4), + 'batched_seconds': round(batched, 4), + 'speedup_x': round(current / batched, 2) if batched else None, + } + + +def benchmark_location_lookup(entries, lookups=500): + tmpdir = tempfile.mkdtemp(prefix='elodie-bench-location-') + os.environ['ELODIE_APPLICATION_DIRECTORY'] = tmpdir + db = Db() + for i in range(entries): + db.add_location(37.0 + i * 0.0001, -122.0 - i * 0.0001, f'p{i}') + + coords = [ + (37.0 + ((i * 17) % entries) * 0.0001, -122.0 - ((i * 17) % entries) * 0.0001) + for i in range(lookups) + ] + + start = time.perf_counter() + for lat, lon in coords: + best_name = None + best_distance = None + for data in db.location_db: + distance = db._distance_m(lat, lon, data) + if distance <= 3000 and (best_distance is None or distance < best_distance): + best_name = data['name'] + best_distance = distance + linear = time.perf_counter() - start + + start = time.perf_counter() + for lat, lon in coords: + db.get_location_name(lat, lon, 3000) + indexed = time.perf_counter() - start + + return { + 'benchmark': 'location_lookup', + 'entries': entries, + 'lookups': lookups, + 'linear_seconds': round(linear, 4), + 'indexed_seconds': round(indexed, 4), + 'speedup_x': round(linear / indexed, 2) if indexed else None, + } + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--entries', + type=int, + nargs='+', + default=[100, 1000, 3000], + help='Number of hash entries to benchmark.', + ) + parser.add_argument( + '--lookups', + type=int, + default=500, + help='Number of location lookups to benchmark.', + ) + args = parser.parse_args() + + for entries in args.entries: + print(json.dumps(benchmark_hash_writes(entries))) + print(json.dumps(benchmark_location_lookup(entries, args.lookups))) + + +if __name__ == '__main__': + main() diff --git a/scripts/profile_import_flow.py b/scripts/profile_import_flow.py new file mode 100644 index 00000000..241011a9 --- /dev/null +++ b/scripts/profile_import_flow.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +import importlib.util +import json +import os +import shutil +import sys +import tempfile +import time +from contextlib import ExitStack +from unittest import mock + +from click.testing import CliRunner + +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.insert(0, REPO_ROOT) +sys.path.insert(0, os.path.join(REPO_ROOT, 'elodie', 'tests')) + +import helper + +from elodie import constants +from elodie.dependencies import get_exiftool +from elodie.external.pyexiftool import ExifTool +from elodie.media.base import Base + + +def load_elodie_module(): + elodie_path = os.path.join(REPO_ROOT, 'elodie.py') + spec = importlib.util.spec_from_file_location('elodie_profile', elodie_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def timed_wrapper(stats, key, func): + def wrapped(*args, **kwargs): + start = time.perf_counter() + try: + return func(*args, **kwargs) + finally: + stats[key]['count'] += 1 + stats[key]['seconds'] += time.perf_counter() - start + return wrapped + + +def main(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--files', type=int, default=50, help='Number of files to import.') + parser.add_argument( + '--fixture', + default='plain.jpg', + help='Fixture file from elodie/tests/files to duplicate.', + ) + args = parser.parse_args() + + stats = { + 'get_metadata': {'count': 0, 'seconds': 0.0}, + 'get_metadata_batch': {'count': 0, 'seconds': 0.0}, + 'get_tags_batch': {'count': 0, 'seconds': 0.0}, + 'process_checksum': {'count': 0, 'seconds': 0.0}, + 'place_name': {'count': 0, 'seconds': 0.0}, + 'file_operation': {'count': 0, 'seconds': 0.0}, + 'flush': {'count': 0, 'seconds': 0.0}, + } + + src_root = tempfile.mkdtemp(prefix='elodie-profile-src-') + dst_root = tempfile.mkdtemp(prefix='elodie-profile-dst-') + app_root = tempfile.mkdtemp(prefix='elodie-profile-app-') + os.environ['ELODIE_APPLICATION_DIRECTORY'] = app_root + + elodie = load_elodie_module() + + exiftool_addedargs = [u'-config', u'"{}"'.format(constants.exiftool_config)] + exiftool = ExifTool(executable_=get_exiftool(), addedargs=exiftool_addedargs) + exiftool.start() + + paths = [] + for i in range(args.files): + file_path = os.path.join(src_root, f'fixture-{i}{os.path.splitext(args.fixture)[1]}') + shutil.copyfile(helper.get_file(args.fixture), file_path) + paths.append(file_path) + + with ExitStack() as stack: + stack.enter_context(mock.patch( + 'elodie.media.base.Base.get_metadata', + timed_wrapper(stats, 'get_metadata', Base.get_metadata) + )) + stack.enter_context(mock.patch( + 'elodie.external.pyexiftool.ExifTool.get_metadata_batch', + timed_wrapper(stats, 'get_metadata_batch', ExifTool.get_metadata_batch) + )) + stack.enter_context(mock.patch( + 'elodie.external.pyexiftool.ExifTool.get_tags_batch', + timed_wrapper(stats, 'get_tags_batch', ExifTool.get_tags_batch) + )) + stack.enter_context(mock.patch( + 'elodie.filesystem.FileSystem.process_checksum', + timed_wrapper(stats, 'process_checksum', elodie.FileSystem.process_checksum) + )) + stack.enter_context(mock.patch( + 'elodie.geolocation.place_name', + timed_wrapper(stats, 'place_name', elodie.geolocation.place_name) + )) + stack.enter_context(mock.patch( + 'elodie.filesystem.FileSystem._file_operation', + timed_wrapper(stats, 'file_operation', elodie.FileSystem._file_operation) + )) + stack.enter_context(mock.patch( + 'elodie.filesystem.FileSystem.flush', + timed_wrapper(stats, 'flush', elodie.FileSystem.flush) + )) + + runner = CliRunner() + start = time.perf_counter() + result = runner.invoke(elodie._import, ['--destination', dst_root, '--allow-duplicates', *paths]) + total = time.perf_counter() - start + + try: + exiftool.terminate() + except Exception: + pass + + output = { + 'files': args.files, + 'fixture': args.fixture, + 'total_seconds': round(total, 4), + 'exit_code': result.exit_code, + 'timings': { + key: { + 'count': value['count'], + 'seconds': round(value['seconds'], 4), + 'share_percent': round((value['seconds'] / total) * 100, 2) if total else 0.0, + } + for key, value in stats.items() + }, + } + print(json.dumps(output)) + + shutil.rmtree(src_root) + shutil.rmtree(dst_root) + shutil.rmtree(app_root) + + +if __name__ == '__main__': + main()