Skip to content

Commit ce78db6

Browse files
committed
optimize import memory consumption
1 parent 58f8ce7 commit ce78db6

5 files changed

Lines changed: 219 additions & 200 deletions

File tree

apps/logs/logic/attempt_import.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def import_new_sushi_attempts():
2121
count = queryset.count()
2222
logger.info('Found %d unprocessed successful download attempts matching criteria', count)
2323
for i, attempt in enumerate(queryset):
24-
logger.info('----- Importing attempt #%d -----', i)
24+
logger.info('----- Importing attempt #%d (pk: %d) -----', i, attempt.pk)
2525
try:
2626
import_one_sushi_attempt(attempt)
2727
except Exception as e:

apps/logs/logic/data_import.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
from collections import Counter, namedtuple
33
from datetime import date
4-
from typing import Optional
4+
from typing import Optional, Tuple, Set
55

66
from core.logic.debug import log_memory
77
from logs.logic.validation import clean_and_validate_issn, ValidationError, normalize_isbn
@@ -244,7 +244,9 @@ def import_counter_records(
244244
to_compare[key] = (pk, value)
245245
# make the comparison
246246
log_memory('XX2')
247-
dicts_to_insert = []
247+
als_to_insert = []
248+
target_date_tuples = set()
249+
max_batch_size = 100_000
248250
for key, value in to_insert.items():
249251
db_pk, db_value = to_compare.get(key, (None, None))
250252
if db_pk:
@@ -257,21 +259,26 @@ def import_counter_records(
257259
else:
258260
rec = dict(key)
259261
rec['value'] = value
260-
dicts_to_insert.append(rec)
262+
als_to_insert.append(AccessLog(import_batch=import_batch, **rec))
263+
if rec['target_id'] is not None:
264+
target_date_tuples.add((rec['target_id'], rec['date']))
265+
if len(als_to_insert) >= max_batch_size:
266+
log_memory('Batch create')
267+
AccessLog.objects.bulk_create(als_to_insert)
268+
stats['new logs'] += len(als_to_insert)
269+
als_to_insert = []
261270
# now insert the records that are clean to be inserted
262271
log_memory('XX3')
263-
AccessLog.objects.bulk_create(
264-
[AccessLog(import_batch=import_batch, **rec) for rec in dicts_to_insert]
265-
)
266-
stats['new logs'] += len(dicts_to_insert)
272+
AccessLog.objects.bulk_create(als_to_insert)
273+
stats['new logs'] += len(als_to_insert)
267274
log_memory('XX4')
268275
# and insert the PlatformTitle links
269-
stats.update(create_platformtitle_links(organization, platform, dicts_to_insert))
276+
stats.update(create_platformtitle_links(organization, platform, target_date_tuples))
270277
log_memory('XX5')
271278
return stats
272279

273280

274-
def create_platformtitle_links(organization, platform, records: [dict]):
281+
def create_platformtitle_links(organization, platform, target_date_tuples: Set[Tuple]):
275282
"""
276283
Takes a set of (target_id, date) tuples collected in `import_counter_records`
277284
and creates the explicit PlatformTitle objects from the data
@@ -280,10 +287,9 @@ def create_platformtitle_links(organization, platform, records: [dict]):
280287
(pt.title_id, pt.date.isoformat())
281288
for pt in PlatformTitle.objects.filter(organization=organization, platform=platform)
282289
}
283-
tuples = {(rec['target_id'], rec['date']) for rec in records if rec['target_id'] is not None}
284290
pts = []
285291
before_count = PlatformTitle.objects.count()
286-
for title_id, rec_date in tuples - existing:
292+
for title_id, rec_date in target_date_tuples - existing:
287293
pts.append(
288294
PlatformTitle(
289295
organization=organization, platform=platform, title_id=title_id, date=rec_date

0 commit comments

Comments
 (0)