From 746e2fb43316fbe86c9d886b0e574bde1100c570 Mon Sep 17 00:00:00 2001 From: mtinti Date: Sat, 20 Jun 2026 11:00:10 +0100 Subject: [PATCH 1/3] Add cohort scripting commands (export / rebuild) Two CLI commands under CommandExecution/AtomicCommands/CohortScript: - ExportCohortAsScript: decompiles a CohortIdentificationConfiguration into a portable, data-free folder: requirement.md placeholder, build.script.yaml, query.sql, and catalogue-manifest.yaml (the extractable columns, patient identifier column(s) and published filters of ONLY the catalogues this cohort uses). The build script captures set operations, nested cohort sub-containers and child order, published-filter imports with parameters, hand-written filters, global and aggregate-level parameters, nested AND/OR filter containers, patient index tables (joinables) with their joins/filters/params, forced joins, disabled sets and sub-containers, customised dimension SQL, and the Project association. No patient data leaves - only catalogue/table/column names and filter logic. - BuildCohortFromScript: the inverse - replays a build.script.yaml in-process to recreate an identical cohort (binds handles via NewObjectPool, restores child order, rebuilds patient index tables and rewrites their instance-specific alias). Round-trip verified on a fixture exercising all of the above: export -> rebuild -> re-export -> execute returns identical cohort membership. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ExecuteCommandBuildCohortFromScript.cs | 402 ++++++++++++++ .../ExecuteCommandExportCohortAsScript.cs | 504 ++++++++++++++++++ 2 files changed, 906 insertions(+) create mode 100644 Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandBuildCohortFromScript.cs create mode 100644 Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandExportCohortAsScript.cs diff --git a/Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandBuildCohortFromScript.cs b/Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandBuildCohortFromScript.cs new file mode 100644 index 0000000000..b7bbf7597c --- /dev/null +++ b/Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandBuildCohortFromScript.cs @@ -0,0 +1,402 @@ +// Rebuilds a cohort from a build.script.yaml produced by ExecuteCommandExportCohortAsScript. +// Replays each command in-process, binding the $c/$a/$p handles to the real ids RDMP assigns +// as each object is created (via NewObjectPool), deletes the auto Inclusion/Exclusion +// containers, and restores child Order (each Add inserts at the top, which would reverse it). +// +// rdmp cmd BuildCohortFromScript .\out\MyCohort\build.script.yaml "MyCohort (rebuilt)" +// +// This is the inverse of ExportCohortAsScript. Verified to reproduce identical membership. 
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using Rdmp.Core.CommandExecution.Combining;
+using Rdmp.Core.CommandLine.Interactive.Picking;
+using Rdmp.Core.Curation.Data;
+using Rdmp.Core.Curation.Data.Aggregation;
+using Rdmp.Core.Curation.Data.Cohort;
+using Rdmp.Core.Curation.Data.Cohort.Joinables;
+using Rdmp.Core.DataExport.Data;
+using Rdmp.Core.MapsDirectlyToDatabaseTable;
+using Rdmp.Core.Repositories;
+using YamlDotNet.Serialization;
+
+namespace Rdmp.Core.CommandExecution.AtomicCommands.CohortScript;
+
+// Replays the Commands list of a build.script.yaml to recreate a cohort. Lines that start with a
+// "runner directive" keyword are handled directly in Execute; every other line is dispatched to
+// the matching RDMP command through CommandInvoker. A trailing " => $handle" on a line names the
+// handle(s) to bind to the object that line creates.
+public class ExecuteCommandBuildCohortFromScript : BasicCommandExecution
+{
+    // a patient-index-table join alias followed by its dot, e.g. "ix12." (group 1 = joinable id)
+    private static readonly Regex IxAliasPattern = new(@"ix([0-9]+)\.", RegexOptions.CultureInvariant);
+
+    private readonly FileInfo _scriptFile;
+    private readonly string _newName;
+    private readonly Dictionary<string, int> _handles = new(); // "$c25" -> real id
+    private readonly Dictionary<int, int> _ixMap = new(); // old joinable id -> new (for ix#### alias)
+    private CohortIdentificationConfiguration _currentCic;
+
+    public ExecuteCommandBuildCohortFromScript(IBasicActivateItems activator,
+        [DemandsInitialization("The build.script.yaml produced by ExportCohortAsScript")]
+        FileInfo scriptFile,
+        [DemandsInitialization("Name for the rebuilt cohort (must be unique)")]
+        string newCohortName = null) : base(activator)
+    {
+        _scriptFile = scriptFile;
+        _newName = newCohortName;
+        if (scriptFile != null && !scriptFile.Exists)
+            SetImpossible("Script file does not exist"); // null is allowed - the GUI prompts
+    }
+
+    // shape of build.script.yaml: a single "Commands:" list of command lines
+    private class ScriptDto { public string[] Commands { get; set; } }
+
+    // Reads the script, replays every command inside one NewObjectPool session, restores child
+    // order, then reports (and publishes) the rebuilt cohort.
+    // Throws Exception when the script has no Commands, names an unknown command or catalogue,
+    // or carries a malformed AssociateWithProject directive.
+    public override void Execute()
+    {
+        base.Execute();
+
+        // start from a clean slate in case this command instance is executed again
+        _handles.Clear();
+        _ixMap.Clear();
+        _currentCic = null;
+
+        // From the GUI the file/name may be unset - prompt for them.
+        var file = _scriptFile ?? BasicActivator.SelectFile("Select build.script.yaml to rebuild", "Cohort script", "*.yaml");
+        if (file == null) return;
+        var newName = _newName;
+        if (string.IsNullOrWhiteSpace(newName) &&
+            (!BasicActivator.TypeText("Rebuild Cohort", "Name for the rebuilt cohort", 200, null, out newName, false)
+             || string.IsNullOrWhiteSpace(newName)))
+            return;
+
+        var script = new Deserializer().Deserialize<ScriptDto>(File.ReadAllText(file.FullName));
+        if (script?.Commands == null || script.Commands.Length == 0)
+            throw new Exception("Script contained no Commands");
+
+        var repo = BasicActivator.RepositoryLocator.CatalogueRepository;
+        var invoker = new CommandInvoker(BasicActivator);
+        var byName = new Dictionary<string, Type>(StringComparer.InvariantCultureIgnoreCase);
+        foreach (var t in invoker.GetSupportedCommands())
+            byName.TryAdd(BasicCommandExecution.GetCommandName(t.Name), t);
+
+        // (parentContainerId, isAggregate, childId) in the order children were added
+        var adds = new List<(int parent, bool isAgg, int child)>();
+
+        using (NewObjectPool.StartSession())
+        {
+            for (var cmdIndex = 0; cmdIndex < script.Commands.Length; cmdIndex++)
+            {
+                var line = script.Commands[cmdIndex];
+
+                // a blank/null list entry carries no command - skip it instead of crashing on tokens[0]
+                if (string.IsNullOrWhiteSpace(line)) continue;
+
+                string binds = null;
+                var i = line.IndexOf(" => ", StringComparison.Ordinal);
+                if (i >= 0) { binds = line[(i + 4)..].Trim(); line = line[..i]; }
+
+                if (line.StartsWith("CreateNewCohortIdentificationConfiguration", StringComparison.OrdinalIgnoreCase)
+                    && !string.IsNullOrWhiteSpace(newName))
+                    line = $"CreateNewCohortIdentificationConfiguration \"{newName}\"";
+
+                line = Substitute(line);
+
+                // rewrite patient-index-table aliases ix<oldId> -> ix<newId>
+                // (the join alias is instance-specific; the PIT was created earlier this run).
+                line = RewriteIxAliases(line);
+
+                // runner directive: create a patient index table (joinable) from a catalogue, with
+                // the given non-identifier dimensions, then convert it to a PIT.
+                //   CreatePatientIndexTable Catalogue:<name> Aggregate:<oldAggId> Dimensions:"col1,col2" => $pit<oldJoinableId>
+                if (line.StartsWith("CreatePatientIndexTable", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var cataName = Field(t, "Catalogue:");
+                    var oldAggId = Field(t, "Aggregate:");
+                    // a missing Dimensions: field means "no extra dimensions", not a null dereference
+                    var dimCols = (Field(t, "Dimensions:") ?? "").Split(',', StringSplitOptions.RemoveEmptyEntries);
+                    var cata = repo.GetAllObjects<Catalogue>().FirstOrDefault(c => c.Name == cataName)
+                               ?? throw new Exception($"CreatePatientIndexTable: no Catalogue named '{cataName}' exists");
+
+                    var aggCmd = new CatalogueCombineable(cata).GenerateAggregateConfigurationFor(BasicActivator, _currentCic);
+                    foreach (var col in dimCols)
+                    {
+                        // columns that no longer exist on the catalogue are skipped (best effort)
+                        var ei = cata.GetAllExtractionInformation().FirstOrDefault(e => e.GetRuntimeName() == col);
+                        if (ei != null) _ = new AggregateDimension(repo, ei, aggCmd.Aggregate);
+                    }
+                    // bind the PIT aggregate's $a<oldAggId> so later SetDimensionSql overrides can target it
+                    if (oldAggId != null) _handles[$"$a{oldAggId}"] = aggCmd.Aggregate.ID;
+                    new ExecuteCommandConvertAggregateConfigurationToPatientIndexTable(BasicActivator, aggCmd, _currentCic).Execute();
+
+                    var joinable = (JoinableCohortAggregateConfiguration)NewObjectPool.Latest(
+                        repo.GetAllObjects<JoinableCohortAggregateConfiguration>());
+                    if (binds != null)
+                    {
+                        _handles[binds] = joinable.ID; // $pit<oldId> -> new joinable id
+                        // old id is whatever follows "$pit"; only record it when the bind has that shape
+                        if (binds.StartsWith("$pit", StringComparison.Ordinal) && int.TryParse(binds[4..], out var oldJoinableId))
+                            _ixMap[oldJoinableId] = joinable.ID;
+                    }
+                    continue;
+                }
+
+                // runner directive: restore a dimension's customised SelectSQL (e.g. the extraction
+                // identifier qualified to [db]..[tbl].[col] so a PIT join's chi isn't ambiguous).
+                //   SetDimensionSql <aggId> "<column>" "<sql>"
+                if (line.StartsWith("SetDimensionSql", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var dim = FindDimension(repo, int.Parse(t[1]), t[2]);
+                    if (dim != null) { dim.SelectSQL = t[3]; dim.SaveToDatabase(); }
+                    continue;
+                }
+
+                // runner directive: restore a dimension's column Alias. Functional for patient
+                // index tables: the joining filter references ix<id>.<alias>.
+                //   SetDimensionAlias <aggId> "<column>" "<alias>"
+                if (line.StartsWith("SetDimensionAlias", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var dim = FindDimension(repo, int.Parse(t[1]), t[2]);
+                    if (dim != null) { dim.Alias = t[3]; dim.SaveToDatabase(); }
+                    continue;
+                }
+
+                // runner directive: make a cohort set join to a patient index table.
+                //   UsePatientIndexTable <setAggId> <joinableId> <joinType>
+                if (line.StartsWith("UsePatientIndexTable", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var setAgg = repo.GetObjectByID<AggregateConfiguration>(int.Parse(t[1]));
+                    var joinable = repo.GetObjectByID<JoinableCohortAggregateConfiguration>(int.Parse(t[2]));
+                    var use = joinable.AddUser(setAgg);
+                    use.JoinType = Enum.Parse<ExtractionJoinType>(t[3], true);
+                    use.SaveToDatabase();
+                    continue;
+                }
+
+                // runner directive: associate the new CIC with a Project (so project-specific
+                // catalogues can be added). Handled directly, not via a command.
+                //   AssociateWithProject Project:<id>
+                if (line.StartsWith("AssociateWithProject", StringComparison.OrdinalIgnoreCase))
+                {
+                    // read the field like every other directive; a missing/garbled Project: is reported clearly
+                    if (!int.TryParse(Field(Tokenize(line), "Project:"), out var pid))
+                        throw new Exception($"AssociateWithProject requires 'Project:<id>' but got '{line}'");
+                    var dx = BasicActivator.RepositoryLocator.DataExportRepository;
+                    _ = new ProjectCohortIdentificationConfigurationAssociation(dx, dx.GetObjectByID<Project>(pid), _currentCic);
+                    continue;
+                }
+
+                // SetContainerOperation on a NAMED cohort container (Root/Inclusion/Exclusion)
+                // prompts for a rename and fails headless - set it directly instead.
+                if (line.StartsWith("SetContainerOperation CohortAggregateContainer:", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var cont = repo.GetObjectByID<CohortAggregateContainer>(int.Parse(t[1][(t[1].IndexOf(':') + 1)..]));
+                    cont.Operation = Enum.Parse<SetOperation>(t[2], true);
+                    cont.SaveToDatabase();
+                    continue;
+                }
+
+                // runner directive: aggregate-level parameter.
+                //   AddAggregateParameter <aggId> "name" "DECLARE @x AS type" "value"
+                if (line.StartsWith("AddAggregateParameter", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line); // [cmd, aggId, name, parameterSQL, value]
+                    var agg = repo.GetObjectByID<AggregateConfiguration>(int.Parse(t[1]));
+                    new AnyTableSqlParameter(repo, agg, t[3]) { Value = t[4] }.SaveToDatabase();
+                    continue;
+                }
+
+                // runner directive: a parameter on a HAND-WRITTEN filter (imported filters get
+                // theirs auto-created, but CreateNewFilter "name" "sql" does not).
+                //   AddFilterParameter <filterId> "name" "DECLARE @x AS type" "value"
+                if (line.StartsWith("AddFilterParameter", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line); // [cmd, filterId, name, parameterSQL, value]
+                    var filter = repo.GetObjectByID<AggregateFilter>(int.Parse(t[1]));
+                    var p = (AggregateFilterParameter)filter.GetFilterFactory().CreateNewParameter(filter, t[3]);
+                    p.Value = t[4];
+                    p.SaveToDatabase();
+                    continue;
+                }
+
+                // runner directive: cohort-level (global) parameter on the CIC itself.
+                //   AddGlobalParameter "name" "DECLARE @x AS type" "value"
+                if (line.StartsWith("AddGlobalParameter", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line); // [cmd, name, parameterSQL, value]
+                    new AnyTableSqlParameter(repo, _currentCic, t[2]) { Value = t[3] }.SaveToDatabase();
+                    continue;
+                }
+
+                // runner directive: force a table into an aggregate's query (AggregateForcedJoin).
+                //   AddForcedJoin <aggId> TableInfo:<name>
+                if (line.StartsWith("AddForcedJoin", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var agg = repo.GetObjectByID<AggregateConfiguration>(int.Parse(t[1]));
+                    var tiName = Field(t, "TableInfo:");
+                    // an unknown table is skipped rather than failing the whole rebuild (best effort)
+                    var ti = repo.GetAllObjects<TableInfo>().FirstOrDefault(x => x.Name == tiName);
+                    if (ti != null) repo.AggregateForcedJoinManager.CreateLinkBetween(agg, ti);
+                    continue;
+                }
+
+                // runner directive: make the aggregate's root filter container with an AND/OR op.
+                //   EnsureFilterContainer <aggId> <AND|OR> => $fcN
+                if (line.StartsWith("EnsureFilterContainer", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var agg = repo.GetObjectByID<AggregateConfiguration>(int.Parse(t[1]));
+                    int fcid;
+                    if (agg.RootFilterContainer_ID == null)
+                    {
+                        var nfc = new AggregateFilterContainer(repo, Enum.Parse<FilterContainerOperation>(t[2], true));
+                        agg.RootFilterContainer_ID = nfc.ID;
+                        agg.SaveToDatabase();
+                        fcid = nfc.ID;
+                    }
+                    else fcid = agg.RootFilterContainer_ID.Value; // already has one - reuse it, operation untouched
+                    if (binds != null) _handles[binds] = fcid;
+                    continue;
+                }
+
+                // runner directive: add an AND/OR sub-container under a filter container.
+                //   AddFilterSubContainer <parentContainerId> <AND|OR> => $fcN
+                if (line.StartsWith("AddFilterSubContainer", StringComparison.OrdinalIgnoreCase))
+                {
+                    var t = Tokenize(line);
+                    var parent = repo.GetObjectByID<AggregateFilterContainer>(int.Parse(t[1]));
+                    var sub = new AggregateFilterContainer(repo, Enum.Parse<FilterContainerOperation>(t[2], true));
+                    parent.AddChild(sub);
+                    if (binds != null) _handles[binds] = sub.ID;
+                    continue;
+                }
+
+                // anything else is a regular RDMP command: look it up by name and invoke it
+                var tokens = Tokenize(line);
+                if (!byName.TryGetValue(tokens[0], out var type))
+                    throw new Exception($"Unknown command '{tokens[0]}'");
+
+                invoker.ExecuteCommand(type, new CommandLineObjectPicker(tokens.Skip(1).ToArray(), BasicActivator));
+
+                BindAndTrack(tokens[0], line, binds, repo, adds);
+            }
+
+            RestoreOrder(repo, adds);
+        }
+
+        // Prefer the cohort this run actually created; a lookup by name alone could return an
+        // older cohort that happens to share the name.
+        var cic = _currentCic
+                  ?? repo.GetAllObjectsWhere<CohortIdentificationConfiguration>("Name", newName).FirstOrDefault();
+        BasicActivator.Show($"Rebuilt cohort '{newName}'" + (cic != null ? $" (ID {cic.ID})" : ""));
+        if (cic != null) Publish(cic);
+    }
+
+    // After a regular command has run: binds the declared handle(s) to the object it just created
+    // (the newest one in the NewObjectPool session) and records container children so their order
+    // can be restored afterwards. cmd is the command name exactly as written in the script.
+    private void BindAndTrack(string cmd, string line, string binds, ICatalogueRepository repo,
+        List<(int, bool, int)> adds)
+    {
+        switch (cmd)
+        {
+            case "CreateNewCohortIdentificationConfiguration":
+                var cic = (CohortIdentificationConfiguration)NewObjectPool.Latest(
+                    repo.GetAllObjects<CohortIdentificationConfiguration>());
+                _currentCic = cic;
+                var root = cic.RootCohortAggregateContainer;
+                if (root != null)
+                {
+                    if (binds != null) _handles[binds] = root.ID;
+                    // remove the auto Inclusion/Exclusion containers (the script recreates what it needs)
+                    foreach (var sub in root.GetSubContainers()
+                                 .Where(s => s.Name is "Inclusion Criteria" or "Exclusion Criteria"))
+                        sub.DeleteInDatabase();
+                }
+                break;
+
+            case "AddCohortSubContainer":
+                var newC = (CohortAggregateContainer)NewObjectPool.Latest(
+                    repo.GetAllObjects<CohortAggregateContainer>());
+                if (binds != null) _handles[binds] = newC.ID;
+                adds.Add((ParentContainerId(line), false, newC.ID));
+                break;
+
+            case "AddCatalogueToCohortIdentificationSetContainer":
+                var newA = (AggregateConfiguration)NewObjectPool.Latest(
+                    repo.GetAllObjects<AggregateConfiguration>());
+                if (binds != null) _handles[binds] = newA.ID;
+                adds.Add((ParentContainerId(line), true, newA.ID));
+                break;
+
+            case "CreateNewFilter":
+                var filter = (AggregateFilter)NewObjectPool.Latest(repo.GetAllObjects<AggregateFilter>());
+                var handles = (binds ?? "").Split(' ', StringSplitOptions.RemoveEmptyEntries);
+                if (handles.Length == 1 && handles[0].StartsWith("$f", StringComparison.Ordinal))
+                {
+                    // hand-written filter: bind the FILTER so AddFilterParameter can create its params
+                    _handles[handles[0]] = filter.ID;
+                }
+                else
+                {
+                    // imported filter: params were auto-created; bind their handles by position
+                    var ps = filter.GetAllParameters().OfType<AggregateFilterParameter>().ToArray();
+                    for (var k = 0; k < handles.Length && k < ps.Length; k++)
+                        _handles[handles[k]] = ps[k].ID;
+                }
+                break;
+        }
+    }
+
+    // Re-numbers the children of each parent container 0..n-1 in the order they were added.
+    private static void RestoreOrder(ICatalogueRepository repo, List<(int parent, bool isAgg, int child)> adds)
+    {
+        foreach (var grp in adds.GroupBy(a => a.parent))
+        {
+            var n = 0;
+            foreach (var (_, isAgg, child) in grp)
+            {
+                if (isAgg)
+                    repo.CohortContainerManager.SetOrder(repo.GetObjectByID<AggregateConfiguration>(child), n);
+                else
+                {
+                    var c = repo.GetObjectByID<CohortAggregateContainer>(child);
+                    c.Order = n;
+                    c.SaveToDatabase();
+                }
+                n++;
+            }
+        }
+    }
+
+    // Replaces every bound handle in the line with its real id. Longest handles go first so that
+    // "$c2" cannot clobber the front of "$c25".
+    private string Substitute(string line)
+    {
+        foreach (var kv in _handles.OrderByDescending(k => k.Key.Length))
+            line = line.Replace(kv.Key, kv.Value.ToString());
+        return line;
+    }
+
+    // Rewrites every "ix<oldJoinableId>." alias to "ix<newJoinableId>." in ONE pass over the line.
+    // Replacing pair-by-pair would cascade whenever a new id equals another table's old id
+    // (1->2 followed by 2->3 would turn "ix1." into "ix3."). Aliases with no mapping are left alone.
+    private string RewriteIxAliases(string line)
+    {
+        if (_ixMap.Count == 0) return line;
+        return IxAliasPattern.Replace(line, m =>
+            int.TryParse(m.Groups[1].Value, out var oldId) && _ixMap.TryGetValue(oldId, out var newId)
+                ? $"ix{newId}."
+                : m.Value);
+    }
+
+    // Id of the container a child was added to, or -1 when the line names none.
+    private static int ParentContainerId(string line)
+    {
+        // first CohortAggregateContainer:<id> in the (already-substituted) line
+        const string key = "CohortAggregateContainer:";
+        var at = line.IndexOf(key, StringComparison.Ordinal);
+        if (at < 0) return -1;
+        var s = at + key.Length;
+        var e = s;
+        while (e < line.Length && char.IsDigit(line[e])) e++;
+        return int.TryParse(line[s..e], out var id) ? id : -1;
+    }
+
+    // Finds an aggregate's dimension by its UNDERLYING column name (stable across aliasing);
+    // falls back to the dimension's runtime name for scripts that predate alias support.
+    // Returns null when neither lookup matches.
+    private static AggregateDimension FindDimension(ICatalogueRepository repo, int aggId, string column)
+    {
+        var agg = repo.GetObjectByID<AggregateConfiguration>(aggId);
+        return agg.AggregateDimensions.FirstOrDefault(d => d.ExtractionInformation?.GetRuntimeName() == column)
+               ?? agg.AggregateDimensions.FirstOrDefault(d => d.GetRuntimeName() == column);
+    }
+
+    // value of a "Key:value" token (quotes already stripped by Tokenize), or null if absent
+    private static string Field(List<string> tokens, string key)
+    {
+        var tok = tokens.FirstOrDefault(x => x.StartsWith(key, StringComparison.OrdinalIgnoreCase));
+        return tok?[key.Length..];
+    }
+
+    // split on spaces, honouring double quotes anywhere in a token; quote chars are dropped.
+    // An empty quoted string ("") still yields an (empty) token. There is no escape syntax.
+    private static List<string> Tokenize(string line)
+    {
+        var tokens = new List<string>();
+        var sb = new StringBuilder();
+        var inQuote = false;
+        var has = false; // true once the current token has started (even if it is still empty)
+        foreach (var ch in line)
+        {
+            if (ch == '"') { inQuote = !inQuote; has = true; }
+            else if (ch == ' ' && !inQuote)
+            {
+                if (has) { tokens.Add(sb.ToString()); sb.Clear(); has = false; }
+            }
+            else { sb.Append(ch); has = true; }
+        }
+        if (has) tokens.Add(sb.ToString());
+        return tokens;
+    }
+}
diff --git a/Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandExportCohortAsScript.cs b/Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandExportCohortAsScript.cs
new file mode 100644
index 0000000000..2db36d8c45
--- /dev/null
+++ b/Rdmp.Core/CommandExecution/AtomicCommands/CohortScript/ExecuteCommandExportCohortAsScript.cs
@@ -0,0 +1,504 @@
+// Exports a Cohort Identification Configuration (CIC) as a portable, data-free triple of text files:
+//   <toDir>/<cohort name>/requirement.md   <toDir>/<cohort name>/build.script.yaml   <toDir>/<cohort name>/query.sql
+//
+// build.script.yaml is a runnable command script that ExecuteCommandBuildCohortFromScript replays
+// to recreate an identical cohort; query.sql is the SQL RDMP would run. No patient data leaves -
+// only catalogue/table/column names and the cohort's filter logic.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Rdmp.Core.Curation.Data;
+using Rdmp.Core.Curation.Data.Aggregation;
+using Rdmp.Core.Curation.Data.Cohort;
+using Rdmp.Core.DataExport.Data;
+using Rdmp.Core.QueryBuilding;
+using YamlDotNet.Serialization;
+
+namespace Rdmp.Core.CommandExecution.AtomicCommands.CohortScript;
+
+public class ExecuteCommandExportCohortAsScript : BasicCommandExecution
+{
+    private readonly CohortIdentificationConfiguration _cic;
+    private readonly DirectoryInfo _outDir;
+
+    // cic: the cohort to export (the command is marked impossible when it is null).
+    // toDir: parent folder for the export; defaults to the current working directory.
+    public ExecuteCommandExportCohortAsScript(IBasicActivateItems activator,
+        [DemandsInitialization("The cohort to export")]
+        CohortIdentificationConfiguration cic,
+        [DemandsInitialization("Folder to write the export into")]
+        DirectoryInfo toDir = null) : base(activator)
+    {
+        if (cic == null)
+            SetImpossible("No CohortIdentificationConfiguration was supplied");
+
+        _cic = cic;
+        _outDir = toDir is null ? new DirectoryInfo(Environment.CurrentDirectory) : toDir;
+    }
+
+    // Writes the export into a sub-folder of the output directory named after the (sanitised)
+    // cohort name, then tells the user where it went.
+    public override void Execute()
+    {
+        base.Execute();
+
+        var exportDir = new DirectoryInfo(Path.Join(_outDir.FullName, Sanitise(_cic.Name)));
+        exportDir.Create();
+
+        // full path of a file inside the export folder
+        string PathOf(string fileName) => Path.Join(exportDir.FullName, fileName);
+
+        // requirement.md starts life as a deliberately empty placeholder: the natural-language
+        // requirement is written in by hand afterwards (from the request form) and is NOT copied
+        // from CIC.Description, which may be unrelated. An existing file is left untouched so a
+        // re-export never destroys a requirement that has already been filled in.
+        var requirementPath = PathOf("requirement.md");
+        if (!File.Exists(requirementPath))
+            File.WriteAllText(requirementPath, $"\n");
+
+        File.WriteAllText(PathOf("build.script.yaml"), BuildScript());
+
+        // query.sql is the SQL as RDMP would run it, i.e. via the cohort's QueryCache when one is
+        // configured (so it references cache tables). SQL generation is best-effort.
+        File.WriteAllText(PathOf("query.sql"), BuildSql(useCache: true));
+
+        // query.uncached.sql is the full query against the raw tables with the cache bypassed.
+        // It is only written when a cache is configured; without one query.sql already is the
+        // un-cached query.
+        if (_cic.QueryCachingServer_ID.HasValue)
+            File.WriteAllText(PathOf("query.uncached.sql"), BuildSql(useCache: false));
+
+        // catalogue-manifest.yaml is the "menu" of building blocks, limited to the catalogues this
+        // cohort actually uses (its cohort sets + patient index tables): their extractable columns,
+        // patient-identifier column(s) and published filters. Scoping it this way keeps it relevant
+        // and avoids dumping every catalogue on the platform.
+        File.WriteAllText(PathOf("catalogue-manifest.yaml"), BuildCatalogueManifest());
+
+        BasicActivator.Show($"Exported '{_cic.Name}' to {exportDir.FullName}");
+    }
+
+    // SQL for the whole cohort, or the best-effort per-set SQL when the whole-cohort query cannot
+    // be built. useCache: false clears the builder's cache server (when the cohort has one) so the
+    // query targets the raw tables.
+    private string BuildSql(bool useCache)
+    {
+        try
+        {
+            var queryBuilder = new CohortQueryBuilder(_cic, null);
+
+            var bypassCache = !useCache && _cic.QueryCachingServer_ID.HasValue;
+            if (bypassCache)
+                queryBuilder.CacheServer = null; // force the query to run against the raw tables
+
+            var sql = queryBuilder.SQL;
+            return sql is null ? "" : sql;
+        }
+        catch (Exception buildError)
+        {
+            // The cohort could not be assembled into one runnable statement. The usual cause is
+            // sets living on different servers / using different credentials with no QueryCache
+            // configured (RDMP cannot UNION/INTERSECT/EXCEPT across servers). Instead of reducing
+            // the file to a single error line, fall back to each set's SQL plus notes on what
+            // could not be combined.
+            return BuildBestEffortSql(buildError);
+        }
+    }
+
+    // Best-effort SQL when the whole-cohort query cannot be generated: emit each cohort set's SQL
+    // (each set is single-server so it generates fine), mirror the set-operation tree as comments,
+    // and end with a notes block describing what could not be converted.
+ private string BuildBestEffortSql(Exception fullBuildError) + { + var lines = new List(); + var failures = new List(); + var servers = new HashSet(StringComparer.OrdinalIgnoreCase); + + lines.Add("-- ============================================================================"); + lines.Add("-- BEST-EFFORT SQL"); + lines.Add("-- The full cohort query could not be assembled into a single runnable statement:"); + lines.Add($"-- {OneLine(fullBuildError.Message)}"); + lines.Add("-- This usually means the cohort's sets are on different servers / use different"); + lines.Add("-- credentials and no QueryCache is configured, so they cannot be combined in one"); + lines.Add("-- query. Each cohort set's SQL is emitted individually below; to run the whole"); + lines.Add("-- cohort configure a QueryCache on the CohortIdentificationConfiguration (or stage"); + lines.Add("-- the per-set results and combine them with the set operations shown as comments)."); + lines.Add("-- ============================================================================"); + lines.Add(""); + + if (_cic.RootCohortAggregateContainer is { } root) + EmitBestEffortContainer(root, 0, lines, failures, servers); + else + lines.Add("-- (this cohort has no root container)"); + + lines.Add(""); + lines.Add("-- ============================================================================"); + lines.Add("-- COULD NOT BE CONVERTED TO A SINGLE SQL QUERY:"); + if (servers.Count > 1) + lines.Add($"-- * The sets above span {servers.Count} server(s)/credential(s): {string.Join(", ", servers.OrderBy(s => s))}."); + lines.Add("-- * SQL Server cannot UNION/INTERSECT/EXCEPT across servers without a QueryCache"); + lines.Add("-- (or linked servers). 
Apply the set operations shown as comments via a QueryCache"); + lines.Add("-- or by combining the per-set results manually."); + foreach (var f in failures) + lines.Add($"-- * {f}"); + lines.Add("-- ============================================================================"); + + return string.Join("\n", lines) + "\n"; + } + + private void EmitBestEffortContainer(CohortAggregateContainer container, int depth, + List lines, List failures, HashSet servers) + { + var indent = new string(' ', depth * 2); + lines.Add($"{indent}/* container \"{container.Name}\" [{container.Operation}] */"); + + var first = true; + foreach (var content in container.GetOrderedContents()) + { + if (!first) + lines.Add($"{indent}-- {container.Operation}"); + first = false; + + switch (content) + { + case AggregateConfiguration agg: + var server = ServerOf(agg); + if (server != null) servers.Add(server); + lines.Add($"{indent}/* set: \"{agg.Name}\" (server: {server ?? "unknown"}) */"); + var sql = SingleSetSql(agg, out var err); + if (err != null) + { + lines.Add($"{indent}-- (this set's SQL could not be generated: {OneLine(err)})"); + failures.Add($"Set \"{agg.Name}\" could not be generated: {OneLine(err)}"); + } + else + { + foreach (var sqlLine in sql.Replace("\r", "").Split('\n')) + lines.Add(indent + sqlLine.TrimEnd()); + } + + break; + case CohortAggregateContainer sub: + EmitBestEffortContainer(sub, depth + 1, lines, failures, servers); + break; + } + } + } + + // A single cohort set is always on one server, so its SQL generates even when the whole cohort + // (which may span servers) cannot. Still guarded: a set could itself be cross-server (e.g. a + // patient index table join to another server). + private string SingleSetSql(AggregateConfiguration agg, out string error) + { + try + { + error = null; + return new CohortQueryBuilder(agg, _cic.GetAllParameters(), null).SQL ?? 
""; + } + catch (Exception e) + { + error = e.Message; + return ""; + } + } + + private static string ServerOf(AggregateConfiguration agg) + { + try + { + var ti = agg.Catalogue?.GetTableInfoList(false).FirstOrDefault(); + return ti == null ? null : $"{ti.Server}/{ti.Database}"; + } + catch + { + return null; + } + } + + // The distinct Catalogues used by this cohort: those behind its cohort sets and its + // patient index tables. Only these are dumped to the manifest. + private IEnumerable CohortCatalogues() + { + var seen = new HashSet(); + var result = new List(); + + void Add(Catalogue c) + { + if (c != null && seen.Add(c.ID)) result.Add(c); + } + + if (_cic.RootCohortAggregateContainer is { } root) + foreach (var agg in root.GetAllAggregateConfigurationsRecursively()) + Add(agg.Catalogue); + + foreach (var j in _cic.GetAllJoinables()) + Add(j.AggregateConfiguration?.Catalogue); + + return result; + } + + private string BuildCatalogueManifest() + { + var catalogues = CohortCatalogues() + .OrderBy(c => c.Name, StringComparer.OrdinalIgnoreCase) + .Select(c => + { + var eis = c.GetAllExtractionInformation(); + return (object)new Dictionary + { + ["id"] = c.ID, + ["name"] = c.Name, + ["identifier_columns"] = eis.Where(e => e.IsExtractionIdentifier) + .Select(e => e.GetRuntimeName()).ToList(), + ["columns"] = eis.Select(e => e.GetRuntimeName()).ToList(), + ["filters"] = c.GetAllFilters() + .OrderBy(f => f.Name, StringComparer.OrdinalIgnoreCase) + .Select(f => (object)new Dictionary + { + ["id"] = f.ID, + ["name"] = f.Name, + ["where"] = OneLine(f.WhereSQL), + ["parameters"] = f.GetAllParameters().Select(p => (object)new Dictionary + { + ["name"] = p.ParameterName, + ["declare"] = OneLine(p.ParameterSQL), + ["value"] = OneLine(p.Value), + ["comment"] = OneLine(p.Comment) + }).ToList() + }).ToList() + }; + }) + .ToList(); + + return new SerializerBuilder().Build() + .Serialize(new Dictionary { ["catalogues"] = catalogues }); + } + + private string BuildScript() + { + 
var root = _cic.RootCohortAggregateContainer; + + // ' => $handle' declares the handle bound to the object a creating command produces, so + // the runner (BuildCohortFromScript) can re-bind it to the real id it gets at build time. + var lines = new List + { + $"# Decompiled from CohortIdentificationConfiguration ID {_cic.ID}", + "Commands:", + root != null + ? $" - CreateNewCohortIdentificationConfiguration \"{_cic.Name}\" => {ContainerRef(root)}" + : $" - CreateNewCohortIdentificationConfiguration \"{_cic.Name}\"", + }; + + // If this cohort is tied to a Project (required to use project-specific catalogues), + // record it so the rebuilt cohort joins the same Project BEFORE its catalogues are added. + var assoc = BasicActivator.RepositoryLocator.DataExportRepository + .GetAllObjectsWhere( + "CohortIdentificationConfiguration_ID", _cic.ID).FirstOrDefault(); + if (assoc != null) + lines.Add($" - AssociateWithProject Project:{assoc.Project_ID}"); + + // Cohort-level (global) parameters - declared first so anything referencing them exists. + EmitGlobalParameters(lines); + + // Patient-index tables (joinables) must exist BEFORE the cohort sets that join to them. + // The runner creates each as $pit and rewrites the ix filter alias. + foreach (var j in _cic.GetAllJoinables()) + { + var pitAgg = j.AggregateConfiguration; + // listed by UNDERLYING column name (an aliased dimension's GetRuntimeName returns the + // alias, which would not match any ExtractionInformation on rebuild) + var dims = pitAgg.AggregateDimensions + .Where(d => d.ExtractionInformation is not { IsExtractionIdentifier: true }) + .Select(UnderlyingColumnName); + // Aggregate: lets the runner bind the rebuilt PIT aggregate to $a so the + // dimension-SQL overrides below (which restore e.g. the qualified chi) can target it. 
+ lines.Add( + $" - CreatePatientIndexTable Catalogue:{Quote(pitAgg.Catalogue.Name)} Aggregate:{pitAgg.ID} Dimensions:\"{string.Join(",", dims)}\" => $pit{j.ID}"); + EmitDimensionOverrides(pitAgg, AggregateRef(pitAgg), lines); + // a patient index table can itself carry forced joins, parameters and a filter tree + EmitAggregateBody(pitAgg, AggregateRef(pitAgg), lines); + } + + if (root != null) + EmitContainer(root, lines); + else + lines.Add(" # (this CIC has no root container)"); + + // Disabled cohort sets / sub-containers are kept in the tree but excluded from the query. + // Emit these LAST (after the whole structure exists) so the disabled state is restored + // without affecting how the structure is built. + if (root != null) + EmitDisabledStates(root, lines); + + return string.Join("\n", lines) + "\n"; + } + + // Restores IsDisabled on any cohort set / sub-container that was disabled. Uses the generic + // Set command (same mechanism as filter-parameter values), so no special runner handling. + private void EmitDisabledStates(CohortAggregateContainer container, List lines) + { + if (container.IsDisabled) + lines.Add($" - Set CohortAggregateContainer:{ContainerRef(container)} IsDisabled true"); + + foreach (var content in container.GetOrderedContents()) + switch (content) + { + case AggregateConfiguration agg when agg.IsDisabled: + lines.Add($" - Set AggregateConfiguration:{AggregateRef(agg)} IsDisabled true"); + break; + case CohortAggregateContainer sub: + EmitDisabledStates(sub, lines); + break; + } + } + + private void EmitGlobalParameters(List lines) + { + // Cohort-level (global) SQL parameters - referenced by filters across any cohort set. + // Recreated on the rebuilt CIC by the runner. Filter parameters are emitted inline as Set + // commands after each filter, so they are not duplicated here. 
+ foreach (var p in _cic.GetAllParameters()) + lines.Add( + $" - AddGlobalParameter \"{p.ParameterName}\" \"{OneLine(p.ParameterSQL)}\" \"{OneLine(p.Value)}\""); + } + + // Containers and aggregates are referenced by stable handles ($c / $a) rather than + // by name: names are not guaranteed unique or meaningful (an unnamed sub-container reports + // its operation as its name). The trailing comment keeps the script human-readable. + private static string ContainerRef(CohortAggregateContainer c) => $"$c{c.ID}"; + private static string AggregateRef(AggregateConfiguration a) => $"$a{a.ID}"; + + private void EmitContainer(CohortAggregateContainer container, List lines) + { + var cref = ContainerRef(container); + lines.Add($" - SetContainerOperation CohortAggregateContainer:{cref} {container.Operation} # {container.Name}"); + // Restore the container's name. Always emitted: it carries custom names (e.g. "UNION T2") + // and fixes the stale default label a rebuilt container would otherwise show (created as + // "UNION", then Operation changed directly without the rename the GUI command performs). + lines.Add($" - Set CohortAggregateContainer:{cref} Name \"{OneLine(container.Name)}\""); + + foreach (var content in container.GetOrderedContents()) + switch (content) + { + case AggregateConfiguration agg: + EmitAggregate(agg, cref, lines); + break; + case CohortAggregateContainer sub: + lines.Add($" - AddCohortSubContainer CohortAggregateContainer:{cref} => {ContainerRef(sub)}"); + EmitContainer(sub, lines); + break; + } + } + + private void EmitAggregate(AggregateConfiguration agg, string containerRef, List lines) + { + var cata = agg.Catalogue; + var cataRef = cata != null ? 
Quote(cata.Name) : ""; + lines.Add( + $" - AddCatalogueToCohortIdentificationSetContainer CohortAggregateContainer:{containerRef} Catalogue:{cataRef} => {AggregateRef(agg)}"); + + foreach (var dim in agg.AggregateDimensions) + lines.Add($" # dimension: {dim.GetRuntimeName()} (auto-set when catalogue added)"); + + // Restore any dimension whose SelectSQL was customised away from the catalogue default + // (e.g. the extraction identifier qualified to [db]..[tbl].[col] so a PIT join isn't ambiguous). + EmitDimensionOverrides(agg, AggregateRef(agg), lines); + + // join-uses: this cohort set joins to a patient-index table ($pit) created earlier. + foreach (var use in agg.PatientIndexJoinablesUsed) + lines.Add( + $" - UsePatientIndexTable {AggregateRef(agg)} $pit{use.JoinableCohortAggregateConfiguration_ID} {use.JoinType}"); + + EmitAggregateBody(agg, AggregateRef(agg), lines); + } + + // Forced joins, aggregate-level parameters and the filter tree of an aggregate. Shared by + // cohort sets and patient index tables (a PIT can have its own filters/params/forced joins too). + private void EmitAggregateBody(AggregateConfiguration agg, string aggRef, List lines) + { + // HAVING clause (e.g. "count(*) >= 2") - changes which patients the set matches. + if (!string.IsNullOrWhiteSpace(agg.HavingSQL)) + lines.Add($" - Set AggregateConfiguration:{aggRef} HavingSQL \"{OneLine(agg.HavingSQL)}\""); + + // forced joins: tables explicitly joined into this aggregate's query + foreach (var ti in agg.ForcedJoins) + lines.Add($" - AddForcedJoin {aggRef} TableInfo:{Quote(ti.Name)}"); + + // aggregate-level parameters (e.g. @window) - distinct from filter parameters. + // Query directly (agg.Parameters also filters on repository-type, which can miss). 
+ foreach (var ap in BasicActivator.RepositoryLocator.CatalogueRepository + .GetAllObjects() + .Where(p => p.ReferencedObjectType == nameof(AggregateConfiguration) && p.ReferencedObjectID == agg.ID)) + lines.Add( + $" - AddAggregateParameter {aggRef} \"{ap.ParameterName}\" \"{OneLine(ap.ParameterSQL)}\" \"{OneLine(ap.Value)}\""); + + if (agg.RootFilterContainer is { } fc) + EmitFilters(fc, aggRef, lines); + } + + // Emits SetDimensionSql / SetDimensionAlias directives for every dimension whose SelectSQL or + // Alias was customised away from its catalogue ExtractionInformation default. Both are keyed by + // the UNDERLYING column name (not GetRuntimeName, which returns the alias once one is set) so + // the runner can find the dimension regardless of alias. An aliased PIT dimension is functional: + // the join filter references ix.<alias>. + private void EmitDimensionOverrides(AggregateConfiguration agg, string aggRef, List lines) + { + foreach (var dim in agg.AggregateDimensions) + { + var key = UnderlyingColumnName(dim); + var eiSql = dim.ExtractionInformation?.SelectSQL; + if (!string.IsNullOrWhiteSpace(dim.SelectSQL) && dim.SelectSQL != eiSql) + lines.Add($" - SetDimensionSql {aggRef} \"{key}\" \"{OneLine(dim.SelectSQL)}\""); + if (!string.IsNullOrWhiteSpace(dim.Alias)) + lines.Add($" - SetDimensionAlias {aggRef} \"{key}\" \"{dim.Alias}\""); + } + } + + // The dimension's underlying catalogue column name, stable across aliasing. + private static string UnderlyingColumnName(AggregateDimension dim) => + dim.ExtractionInformation?.GetRuntimeName() ?? dim.GetRuntimeName(); + + private void EmitFilters(IContainer rootFc, string aggRef, List lines) + { + // Build the aggregate's root filter container with the right AND/OR operation, then fill it. + // (Directives the runner handles directly; the CLI AddNewFilterContainer misbehaves headless.) 
+ var key = $"fc{((AggregateFilterContainer)rootFc).ID}"; + lines.Add($" - EnsureFilterContainer {aggRef} {rootFc.Operation} => ${key}"); + if (((AggregateFilterContainer)rootFc).IsDisabled) + lines.Add($" - Set AggregateFilterContainer:${key} IsDisabled true"); + EmitContainerFilters(rootFc, key, lines); + } + + // fcKey (no leading $) is the handle of the container the filters/sub-containers go INTO. + private void EmitContainerFilters(IContainer fc, string fcKey, List lines) + { + foreach (var filter in fc.GetFilters()) + { + // Recreate every filter from its ACTUAL WhereSQL + parameters (exactly what the GUI + // clone does). We deliberately do NOT re-import via ExtractionFilter: the published + // master filter can have drifted from the cohort's copy (e.g. an EXISTS added/removed, + // or a different default value), which would silently change the query. CreateNewFilter + // with a literal WhereSQL does not auto-create parameters, so we bind the FILTER ($f) + // and create each parameter explicitly. + var afilter = (AggregateFilter)filter; + var afps = filter.GetAllParameters().OfType().ToArray(); + var fid = afilter.ID; + var fbind = afps.Length == 0 && !afilter.IsDisabled ? 
"" : $" => $f{fid}"; + lines.Add( + $" - CreateNewFilter AggregateFilterContainer:${fcKey} \"{filter.Name}\" \"{OneLine(filter.WhereSQL)}\"{fbind}"); + foreach (var afp in afps) + lines.Add( + $" - AddFilterParameter $f{fid} \"{afp.ParameterName}\" \"{OneLine(afp.ParameterSQL)}\" \"{OneLine(afp.Value)}\""); + // a disabled filter is excluded from the WHERE clause but kept in the tree + if (afilter.IsDisabled) + lines.Add($" - Set AggregateFilter:$f{fid} IsDisabled true"); + } + + foreach (var sub in fc.GetSubContainers()) + { + var subKey = $"fc{((AggregateFilterContainer)sub).ID}"; + lines.Add($" - AddFilterSubContainer ${fcKey} {sub.Operation} => ${subKey}"); + if (((AggregateFilterContainer)sub).IsDisabled) + lines.Add($" - Set AggregateFilterContainer:${subKey} IsDisabled true"); + EmitContainerFilters(sub, subKey, lines); + } + } + + private static string Quote(string name) => $"\"{name}\""; + private static string OneLine(string sql) => (sql ?? "").Replace("\r", " ").Replace("\n", " ").Trim(); + + private static string Sanitise(string name) + { + var sb = new StringBuilder(); + foreach (var c in name) + sb.Append(Array.IndexOf(Path.GetInvalidFileNameChars(), c) >= 0 ? '_' : c); + return sb.ToString(); + } +} From 85a80043892ecc1aea778e81ea6f22c70dddd2d3 Mon Sep 17 00:00:00 2001 From: mtinti Date: Sat, 20 Jun 2026 11:00:10 +0100 Subject: [PATCH 2/3] Add in-memory round-trip test for the cohort scripting commands TestCohortScriptRoundTrip builds a cohort in the in-memory MemoryDataExportRepository (set operation, nested AND/OR filter containers, an aggregate-level parameter and a patient index table with an Inner join-use), runs ExportCohortAsScript then BuildCohortFromScript, and asserts the rebuilt object graph matches the original. No database required. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../TestCohortScriptRoundTrip.cs | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 Rdmp.Core.Tests/CommandExecution/TestCohortScriptRoundTrip.cs diff --git a/Rdmp.Core.Tests/CommandExecution/TestCohortScriptRoundTrip.cs b/Rdmp.Core.Tests/CommandExecution/TestCohortScriptRoundTrip.cs new file mode 100644 index 0000000000..8669f2ec08 --- /dev/null +++ b/Rdmp.Core.Tests/CommandExecution/TestCohortScriptRoundTrip.cs @@ -0,0 +1,360 @@ +// Copyright (c) The University of Dundee 2018-2019 +// This file is part of the Research Data Management Platform (RDMP). +// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>. + +using System.IO; +using System.Linq; +using NUnit.Framework; +using Rdmp.Core.CommandExecution.AtomicCommands.CohortScript; +using Rdmp.Core.CommandLine.Interactive; +using Rdmp.Core.Curation.Data; +using Rdmp.Core.Curation.Data.Aggregation; +using Rdmp.Core.Curation.Data.Cohort; +using Rdmp.Core.Curation.Data.Cohort.Joinables; +using Tests.Common; + +namespace Rdmp.Core.Tests.CommandExecution; + +/// +/// Round-trips a <see cref="CohortIdentificationConfiguration"/> entirely in-memory: +/// build a cohort, ExportCohortAsScript it to a build.script.yaml, BuildCohortFromScript a +/// fresh copy, and assert the rebuilt object graph matches. 
Exercises set operations, nested +/// AND/OR filter containers, aggregate-level parameters and patient index tables (joinables) - +/// all without a database (uses the in-memory MemoryDataExportRepository). +/// +internal class TestCohortScriptRoundTrip : UnitTests +{ + [Test] + public void ExportThenRebuild_ComplexCohort_ProducesEquivalentGraph() + { + // --- arrange: three cohort-ready catalogues (each has a patient identifier + a date col) + var cataA = MakeCohortCatalogue("CataA", out var idA, out _); + var cataB = MakeCohortCatalogue("CataB", out var idB, out _); + var cataC = MakeCohortCatalogue("CataC", out var idC, out var dateC); + // a catalogue the cohort does NOT use - must be excluded from the scoped manifest + MakeCohortCatalogue("CataUnused", out _, out _); + + // --- arrange: the original cohort + var cic = new CohortIdentificationConfiguration(Repository, "RoundTrip Original"); + var root = new CohortAggregateContainer(Repository, SetOperation.INTERSECT); + cic.RootCohortAggregateContainer_ID = root.ID; + cic.SaveToDatabase(); + + // a cohort-level (global) parameter + new AnyTableSqlParameter(Repository, cic, "DECLARE @studyStart as datetime2") { Value = "'2019-01-01'" } + .SaveToDatabase(); + + // a patient index table on CataC (exposes the date column), joined by set B below. + // The PIT carries its OWN parameter and filter (must round-trip too). 
+ var pit = new AggregateConfiguration(Repository, cataC, "PIT on C"); + _ = new AggregateDimension(Repository, idC, pit); + // the date column is ALIASED - the join filter would reference ix.event_date, so the + // alias must round-trip (and the dimension must not be dropped because its runtime name + // no longer matches any ExtractionInformation) + var pitDateDim = new AggregateDimension(Repository, dateC, pit) { Alias = "event_date" }; + pitDateDim.SaveToDatabase(); + cic.EnsureNamingConvention(pit); + pit.HavingSQL = "count(*) >= 1"; + new AnyTableSqlParameter(Repository, pit, "DECLARE @minStay as int") { Value = "2" }.SaveToDatabase(); + var pitFc = new AggregateFilterContainer(Repository, FilterContainerOperation.AND); + pit.RootFilterContainer_ID = pitFc.ID; + pit.SaveToDatabase(); + new AggregateFilter(Repository, "EmergencyOnly", pitFc) { WhereSQL = "admission_type = 'E'" }.SaveToDatabase(); + var joinable = new JoinableCohortAggregateConfiguration(Repository, cic, pit); + + // set A: CataA with a nested filter tree (DateFilter AND (CatA OR CatB)) + var aggA = new AggregateConfiguration(Repository, cataA, "People in A"); + _ = new AggregateDimension(Repository, idA, aggA); + cic.EnsureNamingConvention(aggA); + root.AddChild(aggA, 0); + var fcA = new AggregateFilterContainer(Repository, FilterContainerOperation.AND); + aggA.RootFilterContainer_ID = fcA.ID; + aggA.SaveToDatabase(); + _ = new AggregateFilter(Repository, "DateFilter", fcA) { WhereSQL = "MyDateCol >= '2020-01-01'" }; + var orSub = new AggregateFilterContainer(Repository, FilterContainerOperation.OR); + fcA.AddChild(orSub); + _ = new AggregateFilter(Repository, "CatA", orSub) { WhereSQL = "MyOtherCol LIKE 'A%'" }; + // a DISABLED filter: kept in the tree but excluded from the WHERE clause + _ = new AggregateFilter(Repository, "CatB", orSub) { WhereSQL = "MyOtherCol LIKE 'B%'", IsDisabled = true }; + foreach (var f in fcA.GetFilters().Concat(orSub.GetFilters())) + 
((AggregateFilter)f).SaveToDatabase(); + // ...and the OR sub-container itself is DISABLED too + orSub.IsDisabled = true; + orSub.SaveToDatabase(); + // a filter that references a parameter. It is marked as imported from a published filter + // (ClonedFromExtractionFilter_ID) but its WhereSQL has diverged from that master - the + // export must reproduce this ACTUAL text + parameter, not re-import the (drifted) master. + var codeFilter = new AggregateFilter(Repository, "CodeList", fcA) + { + WhereSQL = "MyOtherCol IN (@codes)", + ClonedFromExtractionFilter_ID = 999 // a published filter whose current text differs + }; + codeFilter.SaveToDatabase(); + var codeParam = (AggregateFilterParameter)codeFilter.GetFilterFactory() + .CreateNewParameter(codeFilter, "DECLARE @codes AS varchar(50)"); + codeParam.Value = "'A1','B2'"; + codeParam.SaveToDatabase(); + + // set B: CataB with an aggregate-level parameter and a join-use to the index table + var aggB = new AggregateConfiguration(Repository, cataB, "People in B"); + _ = new AggregateDimension(Repository, idB, aggB); + cic.EnsureNamingConvention(aggB); + root.AddChild(aggB, 1); + aggB.HavingSQL = "count(*) >= 2"; // patients appearing at least twice + aggB.SaveToDatabase(); + new AnyTableSqlParameter(Repository, aggB, "DECLARE @window as int") { Value = "90" }.SaveToDatabase(); + var use = joinable.AddUser(aggB); + use.JoinType = ExtractionJoinType.Inner; + use.SaveToDatabase(); + + // a forced join on set B (force a table into its query) + var forcedTable = WhenIHaveA(); + forcedTable.Name = "[ForcedDb]..[ForcedTable]"; + forcedTable.SaveToDatabase(); + Repository.AggregateForcedJoinManager.CreateLinkBetween(aggB, forcedTable); + + // a DISABLED sub-container (excluded from the query but kept in the tree), holding a set + var cataD = MakeCohortCatalogue("CataD", out var idD, out _); + var disabledBranch = new CohortAggregateContainer(Repository, SetOperation.UNION) + { + Name = "Optional extras" // custom container name 
- must round-trip + }; + disabledBranch.SaveToDatabase(); + root.AddChild(disabledBranch); + var aggD = new AggregateConfiguration(Repository, cataD, "People in D"); + _ = new AggregateDimension(Repository, idD, aggD); + cic.EnsureNamingConvention(aggD); + disabledBranch.AddChild(aggD, 0); + disabledBranch.IsDisabled = true; + disabledBranch.SaveToDatabase(); + + // ...and a DISABLED direct set + aggA.IsDisabled = true; + aggA.SaveToDatabase(); + + // --- act: export to a script, then rebuild a fresh cohort from it + var activator = (ConsoleInputManager)GetActivator(); + activator.DisallowInput = true; + + var outDir = new DirectoryInfo(Path.Join(Path.GetTempPath(), "rdmp-cohortscript-" + System.Guid.NewGuid())); + new ExecuteCommandExportCohortAsScript(activator, cic, outDir).Execute(); + + var scriptFile = new FileInfo(Path.Join(outDir.FullName, "RoundTrip Original", "build.script.yaml")); + Assert.That(scriptFile.Exists, $"export did not produce {scriptFile.FullName}"); + + new ExecuteCommandBuildCohortFromScript(activator, scriptFile, "RoundTrip Rebuilt").Execute(); + + // --- assert: the rebuilt cohort matches the original, structurally + var rebuilt = Repository.GetAllObjectsWhere("Name", "RoundTrip Rebuilt") + .Single(); + var rebuiltRoot = rebuilt.RootCohortAggregateContainer; + + Assert.That(rebuiltRoot.Operation, Is.EqualTo(SetOperation.INTERSECT)); + + var sets = rebuiltRoot.GetAggregateConfigurations(); + Assert.That(sets.Select(a => a.Catalogue.Name).OrderBy(x => x), + Is.EquivalentTo(new[] { "CataA", "CataB" })); + + // set A: nested filter tree round-tripped + var rA = sets.Single(a => a.Catalogue.Name == "CataA"); + var rootFc = rA.RootFilterContainer; + Assert.Multiple(() => + { + Assert.That(rootFc.Operation, Is.EqualTo(FilterContainerOperation.AND)); + Assert.That(rootFc.GetFilters().Select(f => f.WhereSQL), + Is.EquivalentTo(new[] { "MyDateCol >= '2020-01-01'", "MyOtherCol IN (@codes)" })); + Assert.That(rootFc.GetSubContainers(), 
Has.Length.EqualTo(1)); + }); + + // the hand-written filter's parameter (@codes) was re-created with its value + var rCodeFilter = rootFc.GetFilters().Single(f => f.WhereSQL == "MyOtherCol IN (@codes)"); + var rCodeParams = rCodeFilter.GetAllParameters().OfType().ToArray(); + Assert.Multiple(() => + { + Assert.That(rCodeParams.Select(p => p.ParameterName), Has.Member("@codes")); + Assert.That(rCodeParams.Single(p => p.ParameterName == "@codes").Value, Is.EqualTo("'A1','B2'")); + }); + var sub = rootFc.GetSubContainers().Single(); + Assert.Multiple(() => + { + Assert.That(sub.Operation, Is.EqualTo(FilterContainerOperation.OR)); + Assert.That(sub.GetFilters().Select(f => f.WhereSQL), + Is.EquivalentTo(new[] { "MyOtherCol LIKE 'A%'", "MyOtherCol LIKE 'B%'" })); + }); + + // disabled FILTER and disabled filter SUB-CONTAINER round-tripped (both are excluded from + // the WHERE clause when disabled - losing the flag would silently change membership) + Assert.Multiple(() => + { + Assert.That(((AggregateFilterContainer)sub).IsDisabled, Is.True, + "disabled filter sub-container should round-trip disabled"); + Assert.That(((AggregateFilterContainer)rootFc).IsDisabled, Is.False, + "enabled root filter container should stay enabled"); + Assert.That(((AggregateFilter)sub.GetFilters().Single(f => f.Name == "CatB")).IsDisabled, Is.True, + "disabled filter should round-trip disabled"); + Assert.That(((AggregateFilter)sub.GetFilters().Single(f => f.Name == "CatA")).IsDisabled, Is.False, + "enabled filter should stay enabled"); + }); + + // set B: aggregate parameter and join-use round-tripped + var rB = sets.Single(a => a.Catalogue.Name == "CataB"); + var bParams = Repository.GetAllObjects() + .Where(p => p.ReferencedObjectType == nameof(AggregateConfiguration) && p.ReferencedObjectID == rB.ID) + .ToArray(); + Assert.Multiple(() => + { + Assert.That(bParams, Has.Length.EqualTo(1)); + Assert.That(bParams[0].ParameterName, Is.EqualTo("@window")); + Assert.That(bParams[0].Value, 
Is.EqualTo("90")); + }); + + // HAVING clause round-tripped (changes which patients the set matches) + Assert.That(rB.HavingSQL, Is.EqualTo("count(*) >= 2")); + + // patient index table round-tripped (one joinable, used by set B with an Inner join) + var joinables = rebuilt.GetAllJoinables(); + Assert.That(joinables, Has.Length.EqualTo(1)); + var usedBy = rB.PatientIndexJoinablesUsed; + Assert.Multiple(() => + { + Assert.That(usedBy, Has.Length.EqualTo(1)); + Assert.That(usedBy[0].JoinType, Is.EqualTo(ExtractionJoinType.Inner)); + Assert.That(usedBy[0].JoinableCohortAggregateConfiguration_ID, Is.EqualTo(joinables[0].ID)); + }); + + // disabled states round-tripped: the disabled direct set, and the disabled sub-container + // (with its set still inside it). Enabled set B stays enabled. + var rDisabledBranch = rebuiltRoot.GetSubContainers().Single(); + Assert.Multiple(() => + { + Assert.That(rA.IsDisabled, Is.True, "disabled set A should round-trip disabled"); + Assert.That(rB.IsDisabled, Is.False, "enabled set B should stay enabled"); + Assert.That(rDisabledBranch.IsDisabled, Is.True, "disabled sub-container should round-trip disabled"); + Assert.That(rDisabledBranch.GetAggregateConfigurations().Single().Catalogue.Name, Is.EqualTo("CataD")); + }); + + // container NAMES round-tripped: the custom name, and the root's name (a rebuilt root is + // auto-named "Root Container" - the original's name must win) + Assert.Multiple(() => + { + Assert.That(rDisabledBranch.Name, Is.EqualTo("Optional extras"), "custom container name should round-trip"); + Assert.That(rebuiltRoot.Name, Is.EqualTo(root.Name), "root container name should round-trip"); + }); + + // global (cohort-level) parameter round-tripped + var globals = rebuilt.GetAllParameters(); + Assert.Multiple(() => + { + Assert.That(globals.Select(p => p.ParameterName), Has.Member("@studyStart")); + Assert.That(globals.Single(p => p.ParameterName == "@studyStart").Value, Is.EqualTo("'2019-01-01'")); + }); + + // the 
patient index table's OWN parameter and filter round-tripped + var rebuiltPit = joinables[0].AggregateConfiguration; + var pitParams = Repository.GetAllObjects() + .Where(p => p.ReferencedObjectType == nameof(AggregateConfiguration) && p.ReferencedObjectID == rebuiltPit.ID) + .ToArray(); + Assert.Multiple(() => + { + Assert.That(pitParams.Select(p => p.ParameterName), Has.Member("@minStay")); + Assert.That(rebuiltPit.RootFilterContainer, Is.Not.Null, "PIT filter container should round-trip"); + Assert.That(rebuiltPit.RootFilterContainer.GetFilters().Select(f => f.WhereSQL), + Is.EquivalentTo(new[] { "admission_type = 'E'" })); + }); + + // the PIT's HAVING clause and its ALIASED date dimension round-tripped (the aliased + // dimension must exist - it was previously dropped because its runtime name didn't match + // any ExtractionInformation - and carry the alias the join filter references) + Assert.That(rebuiltPit.HavingSQL, Is.EqualTo("count(*) >= 1")); + var rPitDims = rebuiltPit.AggregateDimensions; + Assert.Multiple(() => + { + Assert.That(rPitDims, Has.Length.EqualTo(2), "PIT should have identifier + aliased date dimension"); + Assert.That(rPitDims.Select(d => d.GetRuntimeName()), Has.Member("event_date")); + Assert.That(rPitDims.Single(d => d.GetRuntimeName() == "event_date").ExtractionInformation.GetRuntimeName(), + Is.EqualTo("MyDateCol"), "alias should sit on the original underlying column"); + }); + + // forced join round-tripped on set B + Assert.That(rB.ForcedJoins.Select(t => t.Name), Has.Member("[ForcedDb]..[ForcedTable]")); + + // catalogue-manifest.yaml is scoped to ONLY the catalogues this cohort uses + var manifest = File.ReadAllText(Path.Join(outDir.FullName, "RoundTrip Original", "catalogue-manifest.yaml")); + Assert.Multiple(() => + { + Assert.That(manifest, Does.Contain("CataA")); + Assert.That(manifest, Does.Contain("CataB")); + Assert.That(manifest, Does.Contain("CataC")); + Assert.That(manifest, Does.Contain("CataD")); + 
Assert.That(manifest, Does.Not.Contain("CataUnused"), "manifest must exclude catalogues the cohort doesn't use"); + }); + + outDir.Delete(true); + } + + /// + /// When the whole-cohort SQL cannot be generated (in-memory there is no real server, which is + /// the same failure a real cross-server cohort with no QueryCache hits), query.sql must fall + /// back to a best-effort document: a header, each cohort set listed, and a notes footer - + /// instead of losing everything to one "SQL generation failed" line. + /// + [Test] + public void Export_WhenFullSqlCannotBeGenerated_WritesBestEffortQuery() + { + var cata1 = MakeCohortCatalogue("BE_One", out var id1, out _); + var cata2 = MakeCohortCatalogue("BE_Two", out var id2, out _); + + var cic = new CohortIdentificationConfiguration(Repository, "BestEffort CIC"); + var root = new CohortAggregateContainer(Repository, SetOperation.UNION); + cic.RootCohortAggregateContainer_ID = root.ID; + cic.SaveToDatabase(); + + var s1 = new AggregateConfiguration(Repository, cata1, "People in One"); + _ = new AggregateDimension(Repository, id1, s1); + cic.EnsureNamingConvention(s1); + root.AddChild(s1, 0); + + var s2 = new AggregateConfiguration(Repository, cata2, "People in Two"); + _ = new AggregateDimension(Repository, id2, s2); + cic.EnsureNamingConvention(s2); + root.AddChild(s2, 1); + + var activator = (ConsoleInputManager)GetActivator(); + activator.DisallowInput = true; + var outDir = new DirectoryInfo(Path.Join(Path.GetTempPath(), "rdmp-besteffort-" + System.Guid.NewGuid())); + new ExecuteCommandExportCohortAsScript(activator, cic, outDir).Execute(); + + var sql = File.ReadAllText(Path.Join(outDir.FullName, "BestEffort CIC", "query.sql")); + Assert.Multiple(() => + { + Assert.That(sql, Does.Contain("BEST-EFFORT"), "should fall back to a best-effort document"); + Assert.That(sql, Does.Contain("People in One"), "each cohort set should be listed"); + Assert.That(sql, Does.Contain("People in Two")); + Assert.That(sql, 
Does.Contain("UNION"), "the set operation joining the sets should be shown"); + Assert.That(sql, Does.Contain("COULD NOT BE CONVERTED"), "should end with the could-not-convert notes"); + }); + + outDir.Delete(true); + } + + /// + /// A catalogue with a patient-identifier column (<paramref name="idEi"/>) and a date column + /// (<paramref name="dateEi"/>), suitable for use as a cohort identification set. + /// + private Catalogue MakeCohortCatalogue(string name, out ExtractionInformation idEi, out ExtractionInformation dateEi) + { + var throwaway = WhenIHaveA(Repository, out dateEi, out var otherEi); + var cata = throwaway.Catalogue; + cata.Name = name; + cata.SaveToDatabase(); + + otherEi.IsExtractionIdentifier = true; + otherEi.SaveToDatabase(); + idEi = otherEi; + + throwaway.DeleteInDatabase(); // we only wanted the catalogue + its two ExtractionInformations + return cata; + } +} From 1701d3c873bb061874f32c847c4a7b22f2562c6d Mon Sep 17 00:00:00 2001 From: mtinti Date: Sat, 20 Jun 2026 11:00:10 +0100 Subject: [PATCH 3/3] Document ExportCohortAsScript / BuildCohortFromScript in the CLI tutorial Adds an "Exporting and rebuilding a cohort" section to RdmpCommandLine.md covering both commands and what the build script captures. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../CodeTutorials/RdmpCommandLine.md | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Documentation/CodeTutorials/RdmpCommandLine.md b/Documentation/CodeTutorials/RdmpCommandLine.md index b0ef24ca4e..8f08bad5f4 100644 --- a/Documentation/CodeTutorials/RdmpCommandLine.md +++ b/Documentation/CodeTutorials/RdmpCommandLine.md @@ -109,6 +109,41 @@ Some commands require specifying a database (e.g. `CreateNewCatalogueByImporting See [the technical documentation](../../Rdmp.Core/CommandLine/Runners/ExecuteCommandRunner.md) for how this parsing occurs in code. +### Exporting and rebuilding a cohort + +A [CohortIdentificationConfiguration] can be exported as a portable, data-free script and rebuilt +later (e.g. 
on another RDMP instance). No patient data is written - only catalogue/table/column +names and the cohort's filter logic. + +Export a cohort to a folder: + +``` +./rdmp ExportCohortAsScript CohortIdentificationConfiguration:12 ./out +``` +*Writes `./out/<CohortName>/` containing `build.script.yaml` (a runnable command script that +recreates the cohort), `query.sql` (the SQL RDMP would run), `catalogue-manifest.yaml` (the +extractable columns, patient-identifier column(s) and published filters of just the catalogues +this cohort uses) and a `requirement.md` placeholder.* + +If the cohort cannot be expressed as a single query - typically because its sets span multiple +servers/credentials and no QueryCache is configured (RDMP cannot `UNION`/`INTERSECT`/`EXCEPT` +across servers) - `query.sql` falls back to a *best-effort* document: each cohort set's SQL is +emitted individually with the set-operation tree shown as comments, and a notes block at the end +lists what could not be combined. Each per-set query is valid against its own server. + +Rebuild an identical cohort from that script: + +``` +./rdmp BuildCohortFromScript ./out/<CohortName>/build.script.yaml "My rebuilt cohort" +``` +*Replays the script to create a new `CohortIdentificationConfiguration` named "My rebuilt cohort".* + +The script captures set operations, nested cohort sub-containers (with their names and order), +imported and hand-written filters with their parameters, global (cohort-level) and aggregate-level +parameters, nested AND/OR filter containers, HAVING clauses, patient index tables (joinables) with +their joins/filters/parameters and column aliases, forced joins, disabled sets, sub-containers and +filters, and the Project association (if any). 
+ ## Terminal GUI You can access an interactive terminal similar to the RDMP gui client by running: @@ -135,3 +170,4 @@ For a selection of example scripts see the [scripts folder](../../scripts/) [Pipeline]: ./Glossary.md#Pipeline [Catalogue]: ./Glossary.md#Catalogue +[CohortIdentificationConfiguration]: ./Glossary.md#CohortIdentificationConfiguration