diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 6694daf5e..ab4fdb2f5 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -3,6 +3,8 @@ from loopy.diagnostic import LoopyError from loopy.target.c import CTarget from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 +from loopy.target.c.c_execution import CCompiler +from codepy.toolchain import GCCToolchain # {{{ blas callable @@ -22,7 +24,7 @@ def with_types(self, arg_id_to_dtype, callables_table): if vec_dtype.numpy_dtype == np.float32: name_in_target = "cblas_sgemv" - elif vec_dtype. numpy_dtype == np.float64: + elif vec_dtype.numpy_dtype == np.float64: name_in_target = "cblas_dgemv" else: raise LoopyError("GEMV is only supported for float32 and float64 " @@ -47,30 +49,37 @@ def with_descrs(self, arg_id_to_descr, callables_table): assert mat_descr.shape[0] == res_descr.shape[0] assert len(vec_descr.shape) == len(res_descr.shape) == 1 # handling only the easy case when stride == 1 - assert vec_descr.dim_tags[0].stride == 1 assert mat_descr.dim_tags[1].stride == 1 - assert res_descr.dim_tags[0].stride == 1 return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table def emit_call_insn(self, insn, target, expression_to_code_mapper): from pymbolic import var + from loopy.codegen import UnvectorizableError mat_descr = self.arg_id_to_descr[0] + vec_descr = self.arg_id_to_descr[1] + res_descr = self.arg_id_to_descr[-1] m, n = mat_descr.shape ecm = expression_to_code_mapper + + if ecm.codegen_state.vectorization_info is not None: + raise UnvectorizableError("cannot vectorize BLAS-gemv.") + mat, vec = insn.expression.parameters result, = insn.assignees c_parameters = [var("CblasRowMajor"), var("CblasNoTrans"), m, n, - 1, + 1, # alpha ecm(mat).expr, - 1, + mat_descr.dim_tags[0].stride, # LDA ecm(vec).expr, - 1, + vec_descr.dim_tags[0].stride, # INCX + 0, # beta ecm(result).expr, - 1] + res_descr.dim_tags[0].stride # INCY + ] return (var(self.name_in_target)(*c_parameters), False # cblas_gemv does not return anything ) @@ -83,17 +92,95 @@ def generate_preambles(self, target): # }}} -n = 10 - -knl = lp.make_kernel( - "{:}", +def transform_1(knl): + return knl + + +def transform_2(knl): + # A similar transformation is applied to kernels containing + # SLATE + # callables. + knl = lp.split_iname(knl, "e", 4, inner_iname="e_inner", slabs=(0, 1)) + knl = lp.privatize_temporaries_with_inames(knl, "e_inner") + knl = lp.tag_inames(knl, {"e_inner": "vec"}) + if 0: + # Easy codegen exercise, but misses vectorizing certain instructions. + knl = lp.tag_array_axes(knl, "tmp3", "c,vec") + else: + knl = lp.tag_array_axes(knl, "tmp3,tmp2", "c,vec") + return knl + + +def main(): + + compiler = CCompiler(toolchain=GCCToolchain( + cc="gcc", + cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), + libraries=["blas"], + library_dirs=[], + defines=[], + undefines=[], + source_suffix="c", + so_ext=".so", + o_ext=".o", + include_dirs=[])) + + knl = lp.make_kernel( + "{[e,i1,i2]: 0<=e tmp3[i2] = 2 * tmp2[i2] + out[e, i2] = tmp3[i2] + end + end + """, + kernel_data=[ + lp.TemporaryVariable("tmp1", + shape=(4, ), + dtype=None), + lp.TemporaryVariable("tmp2", + shape=(4, ), + dtype=None), + lp.GlobalArg("A", + shape=(4, 4), + dtype="float64"), + lp.GlobalArg("x", + shape=lp.auto, + dtype="float64"), + ...], + target=lp.ExecutableCVectorExtensionsTarget(compiler=compiler), + lang_version=(2018, 2)) + + knl = lp.register_callable(knl, "matvec", CBLASGEMV("matvec")) + + for transform_func in [transform_1, transform_2]: + knl = transform_func(knl) + print("Generated code from '{transform_func.__name__} -----'") + print(lp.generate_code_v2(knl).device_code()) + print(75 * "-") + + # {{ verify the result is correct. + + from numpy.random import default_rng + + rng = default_rng(seed=0) + a = rng.random((4, 4)) + x = rng.random((100, 4)) + + _, (out,) = knl(A=a, x=x) + + np.testing.assert_allclose(6*np.einsum("ij,ej->ei", + a, x), + out) + + # }}} + + +if __name__ == "__main__": + main() diff --git a/loopy/__init__.py b/loopy/__init__.py index ce3ba1439..e577e0270 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -43,7 +43,8 @@ AddressSpace, TemporaryVariable, SubstitutionRule, - CallMangleInfo) + CallMangleInfo, + VectorizeTag) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.translation_unit import ( @@ -150,10 +151,12 @@ from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran, parse_fortran) -from loopy.target import TargetBase, ASTBuilderBase +from loopy.target import TargetBase, ASTBuilderBase, VectorizationFallback from loopy.target.c import (CFamilyTarget, CTarget, ExecutableCTarget, generate_header, CWithGNULibcTarget, ExecutableCWithGNULibcTarget) +from loopy.target.c_vector_extensions import (CVectorExtensionsTarget, + ExecutableCVectorExtensionsTarget) from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget @@ -190,7 +193,7 @@ "AddressSpace", "TemporaryVariable", "SubstitutionRule", - "CallMangleInfo", + "CallMangleInfo", "VectorizeTag", "make_kernel", "UniqueName", "make_function", @@ -298,9 +301,10 @@ "LoopyError", "LoopyWarning", - "TargetBase", + "TargetBase", "VectorizationFallback", "CFamilyTarget", "CTarget", "ExecutableCTarget", "generate_header", "CWithGNULibcTarget", "ExecutableCWithGNULibcTarget", + "CVectorExtensionsTarget", "ExecutableCVectorExtensionsTarget", "CudaTarget", "OpenCLTarget", "PyOpenCLTarget", "ISPCTarget", "ASTBuilderBase", diff --git a/loopy/check.py b/loopy/check.py index 730e77c0e..3a2c9248c 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -605,13 +605,15 @@ def check_for_data_dependent_parallel_bounds(kernel): Check that inames tagged as hw axes have bounds that are known at kernel launch. """ - from loopy.kernel.data import ConcurrentTag + from loopy.kernel.data import LocalInameTagBase, GroupInameTag for i, dom in enumerate(kernel.domains): dom_inames = set(dom.get_var_names(dim_type.set)) - par_inames = { - iname for iname in dom_inames - if kernel.iname_tags_of_type(iname, ConcurrentTag)} + # do not check for vec-inames as their implementation is accompanied + # with a fallback machinery + par_inames = {iname for iname in dom_inames + if kernel.iname_tags_of_type(iname, (LocalInameTagBase, + GroupInameTag))} if not par_inames: continue diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 199679099..0edd3cdeb 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -281,23 +281,47 @@ def try_vectorized(self, what, func): return self.unvectorize(func) def unvectorize(self, func): + from loopy.codegen.result import (merge_codegen_results, + CodeGenerationResult) + from loopy.target import VectorizationFallback + vinf = self.vectorization_info assert vinf is not None result = [] novec_self = self.copy(vectorization_info=None) - for i in range(vinf.length): - idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i - new_codegen_state = novec_self.fix(vinf.iname, idx_aff) - generated = func(new_codegen_state) - - if isinstance(generated, list): - result.extend(generated) + if self.target.vectorization_fallback == VectorizationFallback.UNROLL: + for i in range(vinf.length): + idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i + new_codegen_state = novec_self.fix(vinf.iname, idx_aff) + generated = func(new_codegen_state) + + if isinstance(generated, list): + result.extend(generated) + else: + result.append(generated) + elif self.target.vectorization_fallback == VectorizationFallback.OMP_SIMD: + astb = self.ast_builder + inner = func(novec_self) + if isinstance(inner, list): + inner = merge_codegen_results(novec_self, inner) + assert isinstance(inner, CodeGenerationResult) + if isinstance(inner.current_ast(novec_self), + astb.ast_comment_class): + # loop body is a comment => do not emit the loop + loop_cgr = inner else: - result.append(generated) + result.append(astb.emit_pragma("omp simd")) + loop_cgr = inner.with_new_ast( + novec_self, + astb.emit_sequential_loop( + novec_self, vinf.iname, self.kernel.index_dtype, + 0, vinf.length-1, inner.current_ast(novec_self))) + result.append(loop_cgr) + else: + raise NotImplementedError(self.target.vectorization_fallback) - from loopy.codegen.result import merge_codegen_results return merge_codegen_results(self, result) @property diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 713254075..383988863 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -127,6 +127,13 @@ def generate_assignment_instruction_code(codegen_state, insn): raise UnvectorizableError( "LHS is scalar, RHS is vector, cannot assign") + if (lhs_is_vector + and (not rhs_is_vector) + and (not + kernel.target.broadcasts_scalar_assignment_to_vec_types)): + raise UnvectorizableError( + "LHS is vector, RHS is not vector, cannot assign") + is_vector = lhs_is_vector del lhs_is_vector diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index a0d22330f..a2ea89c9b 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -160,10 +160,25 @@ def generate_unroll_loop(codegen_state, sched_index): # {{{ vectorized loops +def raise_for_unvectorizable_loop(codegen_state, sched_index): + kernel = codegen_state.kernel + raise RuntimeError(f"Cannot vectorize {kernel.schedule[sched_index]}") + + def generate_vectorize_loop(codegen_state, sched_index): + from loopy.kernel.data import VectorizeTag + from loopy.target import VectorizationFallback kernel = codegen_state.kernel iname = kernel.linearization[sched_index].iname + vec_tag, = kernel.inames[iname].tags_of_type(VectorizeTag) + + if kernel.target.vectorization_fallback == VectorizationFallback.UNROLL: + fallback_codegen_routine = generate_unroll_loop + elif kernel.target.vectorization_fallback == VectorizationFallback.OMP_SIMD: + fallback_codegen_routine = generate_openmp_simd_loop + else: + raise NotImplementedError(kernel.target.vectorization_fallback) bounds = kernel.get_iname_bounds(iname, constants_only=True) @@ -177,7 +192,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_upper_not_const", "upper bound for vectorized loop '%s' is not a constant, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return fallback_codegen_routine(codegen_state, sched_index) length = int(pw_aff_to_expr(length_aff)) @@ -192,7 +207,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_lower_not_0", "lower bound for vectorized loop '%s' is not zero, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return fallback_codegen_routine(codegen_state, sched_index) # {{{ 'implement' vectorization bounds @@ -484,4 +499,17 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # }}} + +# {{{ omp simd loop + +def generate_openmp_simd_loop(codegen_state, sched_index): + return merge_codegen_results( + codegen_state, + [codegen_state.ast_builder.emit_pragma("omp simd"), + generate_sequential_loop_dim_code(codegen_state, + sched_index)]) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/expression.py b/loopy/expression.py index fda3a1499..8da4d403e 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -76,14 +76,13 @@ def combine(vectorizabilities): return reduce(and_, vectorizabilities) def map_sum(self, expr): - return any(self.rec(child) for child in expr.children) + return any([self.rec(child) for child in expr.children]) map_product = map_sum def map_quotient(self, expr): - return (self.rec(expr.numerator) - or - self.rec(expr.denominator)) + return any([self.rec(expr.numerator), + self.rec(expr.denominator)]) def map_linear_subscript(self, expr): return False @@ -176,6 +175,15 @@ def map_reduction(self, expr): # FIXME: Do this more carefully raise UnvectorizableError() + def map_if(self, expr): + # TODO: For OpenCL-target this should be possible, see + # https://www.khronos.org/registry/OpenCL/specs/2.2/html/OpenCL_C.html#operators-ternary-selection + raise UnvectorizableError("Emitting vector instructions with masks not" + " (yet) supported.") + + def map_type_cast(self, expr): + raise UnvectorizableError("Type casting on vector types not supported.") + # }}} # vim: fdm=marker diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index dd182211b..c169a8b73 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1298,7 +1298,9 @@ def eval_expr_assert_integer_constant(i, expr): # We'll do absolutely nothing here, which will result # in the vector being returned. pass - + elif (vectorization_info is None + and kernel.target.allows_non_constant_indexing_for_vec_types): + vector_index = eval_expr(idx) else: idx = eval_expr_assert_integer_constant(i, idx) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index eb373a12d..9c055bc02 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -189,6 +189,18 @@ def map_call_with_kwargs(self, expr): raise NotImplementedError +def _get_stride(dim_tag): + from loopy.kernel.array import (FixedStrideArrayDimTag, + VectorArrayDimTag) + if isinstance(dim_tag, FixedStrideArrayDimTag): + return dim_tag.stride + elif isinstance(dim_tag, VectorArrayDimTag): + # loopy pushes vec axis to the unit stride dim. + return 1 + else: + raise NotImplementedError(type(dim_tag)) + + def get_arg_descriptor_for_expression(kernel, expr): """ :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor` @@ -222,7 +234,7 @@ def get_arg_descriptor_for_expression(kernel, expr): from loopy.symbolic import simplify_using_aff linearized_index = simplify_using_aff( kernel, - sum(dim_tag.stride*iname for dim_tag, iname in + sum(_get_stride(dim_tag)*iname for dim_tag, iname in zip(arg.dim_tags, expr.subscript.index_tuple))) strides_as_dict = SweptInameStrideCollector( diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5bb1043b9..e52dacb16 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -10,6 +10,7 @@ .. autoclass:: OpenCLTarget .. autoclass:: PyOpenCLTarget .. autoclass:: ISPCTarget +.. autoclass:: VectorizationFallback References to Canonical Names ----------------------------- @@ -48,7 +49,9 @@ from typing import (Any, Tuple, Generic, TypeVar, Sequence, ClassVar, Optional, - TYPE_CHECKING) + TYPE_CHECKING, Type) +import abc +from enum import Enum, unique if TYPE_CHECKING: from loopy.typing import ExpressionT @@ -59,11 +62,40 @@ ASTType = TypeVar("ASTType") -class TargetBase(): +@unique +class VectorizationFallback(Enum): + """ + Directs :mod:`loopy`\'s code-generation pipeline how the code should be + generated if an instruction cannot be vectorized. + + :attr UNROLL: Unrolls the instances the unvectorizable statement. + :attr UNROLL: Wraps the statement around a loop with an ``omp simd`` + pragma-directive. + """ + UNROLL = 0 + OMP_SIMD = 1 + + +class TargetBase(abc.ABC): """Base class for all targets, i.e. different combinations of code that loopy can generate. Objects of this type must be picklable. + + .. attribute:: vectorization_fallback + + An instance of :class:`VectorizationFallback`. + + .. attribute:: allows_non_constant_indexing_for_vec_types + + An instance of :class:`bool` that is *True* only if the target + allows vector-typed variables to be indexed via a non-constant + expression. + + .. attribute:: broadcasts_scalar_assignment_to_vec_types + + An instance of :class:`bool` that is *True* only if the target + allows vector-typed variables to be assigned to scalar expressions. """ # {{{ persistent hashing @@ -159,6 +191,25 @@ def get_kernel_executor(self, kernel, *args, **kwargs): """ raise NotImplementedError() + @abc.abstractproperty + def is_executable(self) -> bool: + """ + Returns *True* only if the target allows executing loopy + translation units through :attr:`loopy.TranslationUnit.__call__`. + """ + + @abc.abstractproperty + def vectorization_fallback(self): + pass + + @abc.abstractproperty + def allows_non_constant_indexing_for_vec_types(self): + pass + + @abc.abstractproperty + def broadcasts_scalar_assignment_to_vec_types(self): + pass + class ASTBuilderBase(Generic[ASTType]): """An interface for generating (host or device) ASTs. @@ -229,6 +280,12 @@ def ast_block_class(self): def ast_block_scope_class(self): raise NotImplementedError() + @abc.abstractproperty + def ast_comment_class(self) -> Type[ASTType]: + """ + Returns the type of a comment node in the AST being built. + """ + def get_expression_to_code_mapper(self, codegen_state: CodeGenerationState): raise NotImplementedError() diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index de6a32a68..724f6e9e5 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -841,7 +841,9 @@ def get_function_declaration( # subkernel launches occur only as part of entrypoint kernels for now from loopy.schedule.tools import get_subkernel_arg_info skai = get_subkernel_arg_info(kernel, subkernel_name) - passed_names = skai.passed_names + passed_names = (skai.passed_names + if self.target.is_executable + else [arg.name for arg in kernel.args]) written_names = skai.written_names else: name = Value("static void", name) @@ -952,6 +954,11 @@ def ast_block_class(self): from cgen import Block return Block + @property + def ast_comment_class(self): + from cgen import Comment + return Comment + @property def ast_block_scope_class(self): return ScopingBlock @@ -1256,6 +1263,10 @@ def emit_comment(self, s): from cgen import Comment return Comment(s) + def emit_pragma(self, s): + from cgen import Pragma + return Pragma(s) + @property def can_implement_conditionals(self): return True @@ -1333,6 +1344,23 @@ def get_dtype_registry(self): fill_registry_with_c99_complex_types(result) return DTypeRegistryWrapper(result) + @property + def is_executable(self) -> bool: + return False + + @property + def allows_non_constant_indexing_for_vec_types(self): + return False + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return False + + @property + def vectorization_fallback(self): + from loopy.target import VectorizationFallback + return VectorizationFallback.UNROLL + class CASTBuilder(CFamilyASTBuilder): def preamble_generators(self): @@ -1376,6 +1404,10 @@ def get_host_ast_builder(self): # enable host code generation return CFamilyASTBuilder(self) + @property + def is_executable(self) -> bool: + return True + # }}} diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 496c75e58..097645677 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -217,15 +217,21 @@ def make_var(name): ary = self.find_array(expr) from loopy.kernel.array import get_access_info - from pymbolic import evaluate + from pymbolic import evaluate, substitute from loopy.symbolic import simplify_using_aff index_tuple = tuple( simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple) - access_info = get_access_info(self.kernel, ary, index_tuple, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + if self.kernel.target.allows_non_constant_indexing_for_vec_types: + access_info = get_access_info(self.kernel, ary, index_tuple, + lambda expr: substitute(expr, + dict(self.codegen_state.var_subst_map)), + self.codegen_state.vectorization_info) + else: + access_info = get_access_info(self.kernel, ary, index_tuple, + lambda expr: evaluate(expr, self.codegen_state.var_subst_map), + self.codegen_state.vectorization_info) from loopy.kernel.data import ( ImageArg, ArrayArg, TemporaryVariable, ConstantArg) diff --git a/loopy/target/c_vector_extensions.py b/loopy/target/c_vector_extensions.py new file mode 100644 index 000000000..27516fee0 --- /dev/null +++ b/loopy/target/c_vector_extensions.py @@ -0,0 +1,184 @@ +import numpy as np +from cgen import Declarator +from pytools import memoize_method +from loopy.target import VectorizationFallback +from loopy.target.c import CTarget, CWithGNULibcASTBuilder, ExecutableCTarget +from loopy.types import NumpyType +from loopy.kernel.array import (ArrayBase, FixedStrideArrayDimTag, + VectorArrayDimTag) + + +# {{{ vector types + +class vec: # noqa + pass + + +def _create_vector_types(): + field_names = ["x", "y", "z", "w"] + + vec.types = {} + vec.names_and_dtypes = [] + vec.type_to_scalar_and_count = {} + + counts = [2, 3, 4, 8, 16] + + for base_name, base_type in [ + ("char", np.int8), + ("unsigned char", np.uint8), + ("short", np.int16), + ("unsigned short", np.uint16), + ("int", np.int32), + ("unsigned int", np.uint32), + ("long", np.int64), + ("unsigned long", np.uint64), + ("float", np.float32), + ("double", np.float64), + ]: + for count in counts: + byte_count = count*np.dtype(base_type).itemsize + name = "%s __attribute__((vector_size(%d)))" % (base_name, + byte_count) + + titles = field_names[:count] + + names = [f"s{i}" for i in range(count)] + + if len(titles) < len(names): + titles.extend((len(names)-len(titles))*[None]) + + try: + dtype = np.dtype(dict( + names=names, + formats=[base_type]*count, + titles=titles)) + except NotImplementedError: + try: + dtype = np.dtype([((n, title), base_type) + for (n, title) in zip(names, titles)]) + except TypeError: + dtype = np.dtype([(n, base_type) for (n, title) + in zip(names, titles)]) + + setattr(vec, name, dtype) + + vec.names_and_dtypes.append((name, dtype)) + + vec.types[np.dtype(base_type), count] = dtype + vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count + + +_create_vector_types() + + +def _register_vector_types(dtype_registry): + for name, dtype in vec.names_and_dtypes: + dtype_registry.get_or_register_dtype(name, dtype) + +# }}} + + +# {{{ target + +class CVectorExtensionsTarget(CTarget): + """A specialized C-target that represents vectorization through GCC/Clang + language extensions. + """ + def __init__(self, + vec_fallback: VectorizationFallback = VectorizationFallback.UNROLL, + fortran_abi=False): + super().__init__(fortran_abi=fortran_abi) + self.vec_fallback = vec_fallback + + def get_host_ast_builder(self): + return CVectorExtensionsASTBuilder(self) + + def get_device_ast_builder(self): + return CVectorExtensionsASTBuilder(self) + + @memoize_method + def get_dtype_registry(self): + from loopy.target.c.compyte.dtypes import ( + DTypeRegistry, fill_registry_with_c99_stdint_types, + fill_registry_with_c99_complex_types) + from loopy.target.c import DTypeRegistryWrapper + + result = DTypeRegistry() + fill_registry_with_c99_stdint_types(result) + fill_registry_with_c99_complex_types(result) + + _register_vector_types(result) + return DTypeRegistryWrapper(result) + + def is_vector_dtype(self, dtype): + return (isinstance(dtype, NumpyType) + and dtype.numpy_dtype in list(vec.types.values())) + + def vector_dtype(self, base, count): + return NumpyType( + vec.types[base.numpy_dtype, count], + target=self) + + @property + def allows_non_constant_indexing_for_vec_types(self): + return True + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return False + + @property + def vectorization_fallback(self): + return self.vec_fallback + + +class ExecutableCVectorExtensionsTarget(CVectorExtensionsTarget, + ExecutableCTarget): + def __init__(self, + vec_fallback: VectorizationFallback = VectorizationFallback.UNROLL, + compiler=None, + fortran_abi=False): + ExecutableCTarget.__init__(self, compiler=compiler, fortran_abi=fortran_abi) + self.vec_fallback = vec_fallback + + def get_kernel_executor_cache_key(self, *args, **kwargs): + return ExecutableCTarget.get_kernel_executor_cache_key(self, *args, **kwargs) + + def get_kernel_executor(self, t_unit, *args, **kwargs): + return ExecutableCTarget.get_kernel_executor(self, t_unit, *args, **kwargs) + + @property + def is_executable(self) -> bool: + return True + +# }}} + + +# {{{ AST builder + +class CVectorExtensionsASTBuilder(CWithGNULibcASTBuilder): + def add_vector_access(self, access_expr, index): + return access_expr[index] + + def get_array_base_declarator(self, ary: ArrayBase) -> Declarator: + from loopy.target.c import POD + dtype = ary.dtype + vec_size = ary.vector_size(self.target) + if vec_size > 1: + dtype = self.target.vector_dtype(dtype, vec_size) + + if ary.dim_tags: + for dim_tag in ary.dim_tags: + if isinstance(dim_tag, (FixedStrideArrayDimTag, + VectorArrayDimTag)): + # we're OK with that + pass + else: + raise NotImplementedError( + f"{type(self).__name__} does not understand axis tag " + f"'{type(dim_tag)}.") + + arg_decl = POD(self, dtype, ary.name) + return arg_decl + +# }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 4a311f887..9cdac3259 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -255,6 +255,23 @@ def vector_dtype(self, base, count): # }}} + @property + def is_executable(self) -> bool: + return False + + @property + def allows_non_constant_indexing_for_vec_types(self): + return False + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return True + + @property + def vectorization_fallback(self): + from loopy.target import VectorizationFallback + return VectorizationFallback.UNROLL + # }}} diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 2fbd6bcf8..0d820a406 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -198,6 +198,23 @@ def get_dtype_registry(self): # }}} + @property + def is_executable(self) -> bool: + return False + + @property + def allows_non_constant_indexing_for_vec_types(self): + return False + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return True + + @property + def vectorization_fallback(self): + from loopy.target import VectorizationFallback + return VectorizationFallback.UNROLL + class ISPCASTBuilder(CFamilyASTBuilder): # {{{ top-level codegen @@ -222,7 +239,9 @@ def get_function_declaration( # subkernel launches occur only as part of entrypoint kernels for now from loopy.schedule.tools import get_subkernel_arg_info skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name) - passed_names = skai.passed_names + passed_names = (skai.passed_names + if self.target.is_executable + else [arg.name for arg in kernel.args]) written_names = skai.written_names else: passed_names = [arg.name for arg in kernel.args] @@ -263,7 +282,7 @@ def get_kernel_call(self, codegen_state: CodeGenerationState, "assert(programCount == (%s))" % ecm(lsize[0], PREC_NONE))) - if codegen_state.is_entrypoint: + if codegen_state.is_entrypoint and self.target.is_executable: # subkernel launches occur only as part of entrypoint kernels for now from loopy.schedule.tools import get_subkernel_arg_info skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 6957d3f96..cfe44b113 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -598,6 +598,23 @@ def is_vector_dtype(self, dtype): def vector_dtype(self, base, count): return NumpyType(vec.types[base.numpy_dtype, count]) + @property + def is_executable(self) -> bool: + return False + + @property + def allows_non_constant_indexing_for_vec_types(self): + return False + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return True + + @property + def vectorization_fallback(self): + from loopy.target import VectorizationFallback + return VectorizationFallback.UNROLL + # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index cb78e571b..641a40bef 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -551,6 +551,10 @@ def with_device(self, device): "stop working in 2022.", DeprecationWarning, stacklevel=2) return self + @property + def is_executable(self) -> bool: + return True + # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index cbf6aca24..844fbfb63 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -225,6 +225,11 @@ def ast_block_scope_class(self): # and delete the implementation above. return Collection + @property + def ast_comment_class(self): + from genpy import Comment + return Comment + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper diff --git a/test/test_target.py b/test/test_target.py index b55565a0d..d0ac2138f 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -777,6 +777,99 @@ def test_passing_bajillions_of_svm_args(ctx_factory, with_gbarrier): assert (res[f"c{iargset}"].get() == iargset * multiplier + iargset).all() +def test_non_executable_targets_respect_args(): + # See https://github.com/inducer/loopy/issues/648 + t_unit = lp.make_kernel( + "{ : }", + """ + a[0] = 1729 + """, + [lp.GlobalArg("a,b,c,d,e", + shape=(10,), + dtype="float64")], + target=lp.CTarget() + ) + code_str = lp.generate_code_v2(t_unit).device_code() + + for var in ["b", "c", "d", "e"]: + assert code_str.find(f"double const *__restrict__ {var}") != -1 + + +def test_c_vector_extensions(): + knl = lp.make_kernel( + "{[i, j1, j2, j3]: 0<=i<10 and 0<=j1,j2,j3<4}", + """ + <> temp1[j1] = x[i, j1] + <> temp2[j2] = 2*temp1[j2] + 1 {inames=i:j2} + y[i, j3] = temp2[j3] + """, + [lp.GlobalArg("x, y", shape=lp.auto, dtype=float)], + seq_dependencies=True, + target=lp.CVectorExtensionsTarget()) + + knl = lp.tag_inames(knl, "j2:vec, j1:ilp, j3:ilp") + knl = lp.tag_array_axes(knl, "temp1,temp2", "vec") + + print(lp.generate_code_v2(knl).device_code()) + + +def test_vec_tag_with_omp_simd_fallback(): + knl = lp.make_kernel( + "{[i, j1, j2, j3]: 0<=i<10 and 0<=j1,j2,j3<4}", + """ + <> temp1[j1] = x[i, j1] + <> temp2[j2] = 2*temp1[j2] + 1 {inames=i:j2} + y[i, j3] = temp2[j3] + """, + [lp.GlobalArg("x, y", shape=lp.auto, dtype=float)], + seq_dependencies=True, + target=lp.ExecutableCVectorExtensionsTarget( + lp.VectorizationFallback.OMP_SIMD) + ) + + knl = lp.tag_inames(knl, {"j1": "vec", + "j2": "vec", + "j3": "vec"}) + knl = lp.tag_array_axes(knl, "temp1,temp2", "vec") + + code_str = lp.generate_code_v2(knl).device_code() + + assert len([line + for line in code_str.split("\n") + if line.strip() == "#pragma omp simd"]) == 2 + + x = np.random.rand(10, 4) + _, (out,) = knl(x=x) + np.testing.assert_allclose(out, 2*x+1) + + +def test_vec_extensions_with_multiple_loopy_body_insns(): + knl = lp.make_kernel( + "{[n]: 0<=n tmp = 2.0 + dat0[n, 0] = tmp {id=expr_insn} + ... nop {id=statement0} + end + """, + seq_dependencies=True, + target=lp.ExecutableCVectorExtensionsTarget( + lp.VectorizationFallback.OMP_SIMD) + ) + + knl = lp.add_dtypes(knl, {"dat0": "float64"}) + knl = lp.split_iname(knl, "n", 4, slabs=(1, 1), + inner_iname="n_batch") + knl = lp.privatize_temporaries_with_inames(knl, "n_batch") + knl = lp.tag_array_axes(knl, "tmp", "vec") + knl = lp.tag_inames(knl, {"n_batch": "vec"}) + + _, (out,) = knl(N=100) + np.testing.assert_allclose(out, 2*np.ones((100, 1))) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])