Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Include/internal/pycore_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ extern Py_ssize_t _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject

extern int _PyDict_GetMethodStackRef(PyDictObject *dict, PyObject *name, _PyStackRef *method);

extern Py_ssize_t _PyDict_LookupIndexAndValue(PyDictObject *, PyObject *, PyObject **);
extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *);
extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
If we are specializing to ``LOAD_GLOBAL_MODULE`` or ``LOAD_ATTR_MODULE``, try
to enable deferred reference counting for the value, if the object is owned by
a different thread. This applies to the free-threaded build only and should
improve scaling of multi-threaded programs. Note that when deferred reference
counting is enabled, the object will be deallocated by the GC, rather than by
:c:func:`Py_DECREF`.
125 changes: 12 additions & 113 deletions Modules/_testinternalcapi/interpreter.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@

int Test_EvalFrame_Resumes, Test_EvalFrame_Loads;

#ifdef _Py_TIER2
/* Test stub: accepts the tracing state and frame but performs no work.
 * Always reports success (0) so the test interpreter never enters the JIT. */
static int
stop_tracing_and_jit(PyThreadState *tstate, _PyInterpreterFrame *frame)
{
    (void)tstate;
    (void)frame;
    return 0;
}
#endif

_PyJitEntryFuncPtr _Py_jit_entry;

#if _Py_TAIL_CALL_INTERP
#include "test_targets.h"
#include "test_cases.c.h"
Expand Down Expand Up @@ -78,12 +90,6 @@ Test_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag)
frame->previous = &entry.frame;
tstate->current_frame = frame;
entry.frame.localsplus[0] = PyStackRef_NULL;
#ifdef _Py_TIER2
if (tstate->current_executor != NULL) {
entry.frame.localsplus[0] = PyStackRef_FromPyObjectNew(tstate->current_executor);
tstate->current_executor = NULL;
}
#endif

/* support for generator.throw() */
if (throwflag) {
Expand Down Expand Up @@ -119,11 +125,6 @@ Test_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag)
#endif
}

#if defined(_Py_TIER2) && !defined(_Py_JIT)
/* Tier 2 interpreter state */
_PyExecutorObject *current_executor = NULL;
const _PyUOpInstruction *next_uop = NULL;
#endif
#if _Py_TAIL_CALL_INTERP
# if Py_STATS
return _TAIL_CALL_start_frame(frame, NULL, tstate, NULL, instruction_funcptr_handler_table, 0, lastopcode);
Expand All @@ -136,108 +137,6 @@ Test_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag)
#endif


#ifdef _Py_TIER2

// Tier 2 is also here!
enter_tier_two:

#ifdef _Py_JIT
assert(0);
#else

#undef LOAD_IP
#define LOAD_IP(UNUSED) (void)0

#ifdef Py_STATS
// Disable these macros that apply to Tier 1 stats when we are in Tier 2
#undef STAT_INC
#define STAT_INC(opname, name) ((void)0)
#undef STAT_DEC
#define STAT_DEC(opname, name) ((void)0)
#endif

#undef ENABLE_SPECIALIZATION
#define ENABLE_SPECIALIZATION 0
#undef ENABLE_SPECIALIZATION_FT
#define ENABLE_SPECIALIZATION_FT 0

; // dummy statement after a label, before a declaration
uint16_t uopcode;
#ifdef Py_STATS
int lastuop = 0;
uint64_t trace_uop_execution_counter = 0;
#endif

assert(next_uop->opcode == _START_EXECUTOR);
tier2_dispatch:
for (;;) {
uopcode = next_uop->opcode;
#ifdef Py_DEBUG
if (frame->lltrace >= 3) {
dump_stack(frame, stack_pointer);
if (next_uop->opcode == _START_EXECUTOR) {
printf("%4d uop: ", 0);
}
else {
printf("%4d uop: ", (int)(next_uop - current_executor->trace));
}
_PyUOpPrint(next_uop);
printf("\n");
}
#endif
next_uop++;
OPT_STAT_INC(uops_executed);
UOP_STAT_INC(uopcode, execution_count);
UOP_PAIR_INC(uopcode, lastuop);
#ifdef Py_STATS
trace_uop_execution_counter++;
((_PyUOpInstruction *)next_uop)[-1].execution_count++;
#endif

switch (uopcode) {

#include "executor_cases.c.h"

default:
#ifdef Py_DEBUG
{
printf("Unknown uop: ");
_PyUOpPrint(&next_uop[-1]);
printf(" @ %d\n", (int)(next_uop - current_executor->trace - 1));
Py_FatalError("Unknown uop");
}
#else
Py_UNREACHABLE();
#endif

}
}

jump_to_error_target:
#ifdef Py_DEBUG
if (frame->lltrace >= 2) {
printf("Error: [UOp ");
_PyUOpPrint(&next_uop[-1]);
printf(" @ %d -> %s]\n",
(int)(next_uop - current_executor->trace - 1),
_PyOpcode_OpName[frame->instr_ptr->op.code]);
}
#endif
assert(next_uop[-1].format == UOP_FORMAT_JUMP);
uint16_t target = uop_get_error_target(&next_uop[-1]);
next_uop = current_executor->trace + target;
goto tier2_dispatch;

jump_to_jump_target:
assert(next_uop[-1].format == UOP_FORMAT_JUMP);
target = uop_get_jump_target(&next_uop[-1]);
next_uop = current_executor->trace + target;
goto tier2_dispatch;

#endif // _Py_JIT

#endif // _Py_TIER2

early_exit:
assert(_PyErr_Occurred(tstate));
_Py_LeaveRecursiveCallPy(tstate);
Expand Down
12 changes: 9 additions & 3 deletions Objects/dictobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2349,10 +2349,9 @@ dict_unhashable_type(PyObject *key)
}

Py_ssize_t
_PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
_PyDict_LookupIndexAndValue(PyDictObject *mp, PyObject *key, PyObject **value)
{
// TODO: Thread safety
PyObject *value;
assert(PyDict_CheckExact((PyObject*)mp));
assert(PyUnicode_CheckExact(key));

Expand All @@ -2362,7 +2361,14 @@ _PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
return -1;
}

return _Py_dict_lookup(mp, key, hash, &value);
return _Py_dict_lookup(mp, key, hash, value);
}

/* Look up `key` in `mp` and return its index only.
 * Thin wrapper over _PyDict_LookupIndexAndValue; the value slot is
 * filled but intentionally ignored by this caller. */
Py_ssize_t
_PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
{
    PyObject *unused_value;
    return _PyDict_LookupIndexAndValue(mp, key, &unused_value);
}

/* Same as PyDict_GetItemWithError() but with hash supplied by caller.
Expand Down
30 changes: 28 additions & 2 deletions Python/specialize.c
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,21 @@ static int function_kind(PyCodeObject *code);
static bool function_check_args(PyObject *o, int expected_argcount, int opcode);
static uint32_t function_get_version(PyObject *o, int opcode);

#ifdef Py_GIL_DISABLED
/* Enable deferred reference counting on `op` when it is shared across
 * threads (i.e. not owned by the calling thread).  Module-level values
 * that are read from many threads scale much better with deferred
 * refcounting; the trade-off is that such objects are only reclaimed
 * by a GC pass instead of immediately on refcount-zero. */
static void
maybe_enable_deferred_ref_count(PyObject *op)
{
    if (_Py_IsOwnedByCurrentThread(op)) {
        return;
    }
    PyUnstable_Object_EnableDeferredRefcount(op);
}
#endif


static int
specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, PyObject *name)
{
Expand All @@ -366,7 +381,8 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P
SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_ATTR_NON_STRING);
return -1;
}
Py_ssize_t index = _PyDict_LookupIndex(dict, name);
PyObject *value;
Py_ssize_t index = _PyDict_LookupIndexAndValue(dict, name, &value);
assert(index != DKIX_ERROR);
if (index != (uint16_t)index) {
SPECIALIZATION_FAIL(LOAD_ATTR,
Expand All @@ -381,6 +397,9 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P
SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_OUT_OF_VERSIONS);
return -1;
}
#ifdef Py_GIL_DISABLED
maybe_enable_deferred_ref_count(value);
#endif
write_u32(cache->version, keys_version);
cache->index = (uint16_t)index;
specialize(instr, LOAD_ATTR_MODULE);
Expand Down Expand Up @@ -1269,7 +1288,6 @@ specialize_attr_loadclassattr(PyObject *owner, _Py_CODEUNIT *instr,
return 1;
}


static void
specialize_load_global_lock_held(
PyObject *globals, PyObject *builtins,
Expand All @@ -1289,7 +1307,12 @@ specialize_load_global_lock_held(
SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_LOAD_GLOBAL_NON_STRING_OR_SPLIT);
goto fail;
}
#ifdef Py_GIL_DISABLED
PyObject *value;
Py_ssize_t index = _PyDict_LookupIndexAndValue((PyDictObject *)globals, name, &value);
#else
Py_ssize_t index = _PyDictKeys_StringLookup(globals_keys, name);
#endif
if (index == DKIX_ERROR) {
SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_EXPECTED_ERROR);
goto fail;
Expand All @@ -1310,6 +1333,9 @@ specialize_load_global_lock_held(
SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_OUT_OF_RANGE);
goto fail;
}
#ifdef Py_GIL_DISABLED
maybe_enable_deferred_ref_count(value);
#endif
cache->index = (uint16_t)index;
cache->module_keys_version = (uint16_t)keys_version;
specialize(instr, LOAD_GLOBAL_MODULE);
Expand Down
1 change: 1 addition & 0 deletions Tools/c-analyzer/cpython/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def format_tsv_lines(lines):
'Python/opcode_targets.h',
'Modules/_testinternalcapi/test_targets.h',
'Modules/_testinternalcapi/test_cases.c.h',
'Modules/_testinternalcapi/interpreter.c',
# XXX: Throws errors if PY_VERSION_HEX is not mocked out
'Modules/clinic/_testclinic_depr.c.h',

Expand Down
9 changes: 9 additions & 0 deletions Tools/ftscalingbench/ftscalingbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# > echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost
#

import copy
import math
import os
import queue
Expand Down Expand Up @@ -214,6 +215,14 @@ def instantiate_dataclass():
for _ in range(1000 * WORK_SCALE):
obj = MyDataClass(x=1, y=2, z=3)


@register_benchmark
def deepcopy():
    # Repeatedly deep-copy a small mixed container (list + tuple values)
    # to measure copy.deepcopy() scaling under the free-threaded build.
    sample = {'list': [1, 2], 'tuple': (1, None)}
    for _ in range(40 * WORK_SCALE):
        copy.deepcopy(sample)


def bench_one_thread(func):
t0 = time.perf_counter_ns()
func()
Expand Down
Loading