From 6f2cb3c29eaadfaee77e8a861bc1480cb2f42da7 Mon Sep 17 00:00:00 2001 From: EazyHood <209367218+EazyHood@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:40:33 -0500 Subject: [PATCH] feat(comprehend): add Amazon Comprehend PII redaction utility Adds a Comprehend class that detects PII via Amazon Comprehend (DetectPiiEntities) and returns a redacted prompt, ready to chain into existing Endpoint classes (OpenAI, GeminiAI, ...). - arakoodev/src/comprehend: Comprehend.redact() / detectPii() + static applyRedaction() with safe reverse-offset replacement - tests covering offset handling, custom masks, empty/invalid input - examples/redact-pii-with-comprehend: redaction -> OpenAI chain demo - register ./comprehend export and @aws-sdk/client-comprehend dep Closes #290 --- JS/edgechains/arakoodev/package.json | 2 + .../arakoodev/src/comprehend/src/index.ts | 1 + .../src/lib/comprehend/comprehend.ts | 108 ++++++++++++++++++ .../src/tests/comprehend/comprehend.test.ts | 75 ++++++++++++ .../redact-pii-with-comprehend/package.json | 18 +++ .../redact-pii-with-comprehend/readme.md | 48 ++++++++ .../redact-pii-with-comprehend/src/index.ts | 32 ++++++ .../redact-pii-with-comprehend/tsconfig.json | 13 +++ 8 files changed, 297 insertions(+) create mode 100644 JS/edgechains/arakoodev/src/comprehend/src/index.ts create mode 100644 JS/edgechains/arakoodev/src/comprehend/src/lib/comprehend/comprehend.ts create mode 100644 JS/edgechains/arakoodev/src/comprehend/src/tests/comprehend/comprehend.test.ts create mode 100644 JS/edgechains/examples/redact-pii-with-comprehend/package.json create mode 100644 JS/edgechains/examples/redact-pii-with-comprehend/readme.md create mode 100644 JS/edgechains/examples/redact-pii-with-comprehend/src/index.ts create mode 100644 JS/edgechains/examples/redact-pii-with-comprehend/tsconfig.json diff --git a/JS/edgechains/arakoodev/package.json b/JS/edgechains/arakoodev/package.json index 0b0bd3784..12dc9eca0 100644 --- a/JS/edgechains/arakoodev/package.json 
+++ b/JS/edgechains/arakoodev/package.json
@@ -7,6 +7,7 @@
     ],
     "exports": {
         "./ai": "./dist/ai/src/index.js",
+        "./comprehend": "./dist/comprehend/src/index.js",
         "./vector-db": "./dist/vector-db/src/index.js",
         "./document-loader": "./dist/document-loader/src/index.js",
         "./splitter": "./dist/splitter/src/index.js",
@@ -22,6 +23,7 @@
         "test": "vitest"
     },
     "dependencies": {
+        "@aws-sdk/client-comprehend": "^3.700.0",
         "@babel/core": "^7.24.4",
         "@babel/preset-env": "^7.24.4",
         "@hono/node-server": "^0.6.0",
diff --git a/JS/edgechains/arakoodev/src/comprehend/src/index.ts b/JS/edgechains/arakoodev/src/comprehend/src/index.ts
new file mode 100644
index 000000000..c9fe4f3a1
--- /dev/null
+++ b/JS/edgechains/arakoodev/src/comprehend/src/index.ts
@@ -0,0 +1 @@
+export { Comprehend } from "./lib/comprehend/comprehend.js";
diff --git a/JS/edgechains/arakoodev/src/comprehend/src/lib/comprehend/comprehend.ts b/JS/edgechains/arakoodev/src/comprehend/src/lib/comprehend/comprehend.ts
new file mode 100644
index 000000000..f57a51772
--- /dev/null
+++ b/JS/edgechains/arakoodev/src/comprehend/src/lib/comprehend/comprehend.ts
@@ -0,0 +1,123 @@
+import {
+    ComprehendClient,
+    DetectPiiEntitiesCommand,
+    PiiEntity,
+    LanguageCode,
+} from "@aws-sdk/client-comprehend";
+
+interface ComprehendConstructionOptions {
+    accessKeyId?: string;
+    secretAccessKey?: string;
+    region?: string;
+}
+
+interface RedactOptions {
+    /** Language of the text. Defaults to "en". */
+    languageCode?: LanguageCode;
+    /**
+     * Builds the replacement string for a detected entity. Receives the PII
+     * type (e.g. "EMAIL", "NAME") and returns the mask. Defaults to `[TYPE]`.
+     */
+    mask?: (type: string) => string;
+}
+
+/**
+ * Comprehend wraps Amazon Comprehend's PII detection so sensitive data can be
+ * stripped from a prompt before it is chained into an Endpoint class
+ * (OpenAI, GeminiAI, LlamaAI, ...).
+ *
+ *   const comprehend = new Comprehend();
+ *   const safePrompt = await comprehend.redact(userInput);
+ *   const answer = await openai.chat({ prompt: safePrompt });
+ */
+export class Comprehend {
+    private client: ComprehendClient;
+
+    constructor(options: ComprehendConstructionOptions = {}) {
+        const region = options.region || process.env.AWS_REGION || "us-east-1";
+        const accessKeyId = options.accessKeyId || process.env.AWS_ACCESS_KEY_ID;
+        const secretAccessKey = options.secretAccessKey || process.env.AWS_SECRET_ACCESS_KEY;
+
+        this.checkKeys(accessKeyId, secretAccessKey);
+
+        this.client = new ComprehendClient({
+            region,
+            // When the keys are absent the SDK falls back to the default
+            // provider chain (env, shared config, IAM role, ...).
+            credentials:
+                accessKeyId && secretAccessKey ? { accessKeyId, secretAccessKey } : undefined,
+        });
+    }
+
+    private checkKeys(accessKeyId?: string, secretAccessKey?: string): void {
+        if (!accessKeyId || !secretAccessKey) {
+            console.warn(
+                "AWS credentials are missing. Provide them in the constructor or as " +
+                    "AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY. The default AWS provider " +
+                    "chain will be used instead."
+            );
+        }
+    }
+
+    /**
+     * Returns the raw PII entities Amazon Comprehend detects in `text`.
+     * Empty input short-circuits to `[]` without calling AWS.
+     */
+    async detectPii(text: string, languageCode: LanguageCode = "en"): Promise<PiiEntity[]> {
+        if (!text) return [];
+        const response = await this.client.send(
+            new DetectPiiEntitiesCommand({ Text: text, LanguageCode: languageCode })
+        );
+        return response.Entities ?? [];
+    }
+
+    /**
+     * Detects PII in `text` and replaces every entity with a mask, returning a
+     * redacted copy of the string. Safe to chain straight into an Endpoint.
+     */
+    async redact(text: string, options: RedactOptions = {}): Promise<string> {
+        const entities = await this.detectPii(text, options.languageCode ?? "en");
+        return Comprehend.applyRedaction(text, entities, options.mask);
+    }
+
+    /**
+     * Pure replacement step. Exposed (static) so the offset handling can be
+     * unit-tested without hitting AWS.
+     *
+     * Entities with missing, non-integer, negative, empty or out-of-range
+     * offsets are skipped. The remaining spans are walked left to right and
+     * the result is rebuilt from the untouched text between them, so one
+     * replacement can never shift another span's offsets. Overlapping or
+     * duplicated spans are merged into a single masked region (labelled with
+     * the type of the span that starts first) instead of being spliced twice.
+     */
+    static applyRedaction(
+        text: string,
+        entities: PiiEntity[],
+        mask: (type: string) => string = (type) => `[${type}]`
+    ): string {
+        if (!text || entities.length === 0) return text;
+
+        const spans: { begin: number; end: number; type: string }[] = [];
+        for (const { BeginOffset: begin, EndOffset: end, Type: type } of entities) {
+            if (begin == null || end == null) continue;
+            if (!Number.isInteger(begin) || !Number.isInteger(end)) continue;
+            if (begin < 0 || end > text.length || begin >= end) continue;
+            spans.push({ begin, end, type: type ?? "PII" });
+        }
+        // Left to right; on equal starts the longer span goes first.
+        spans.sort((a, b) => a.begin - b.begin || b.end - a.end);
+
+        const pieces: string[] = [];
+        let cursor = 0; // end of the region already copied or masked
+        for (const span of spans) {
+            if (span.begin >= cursor) {
+                pieces.push(text.slice(cursor, span.begin), mask(span.type));
+            }
+            // A span starting before `cursor` overlaps the previous mask: only widen it.
+            cursor = Math.max(cursor, span.end);
+        }
+        pieces.push(text.slice(cursor));
+        return pieces.join("");
+    }
+}
diff --git a/JS/edgechains/arakoodev/src/comprehend/src/tests/comprehend/comprehend.test.ts b/JS/edgechains/arakoodev/src/comprehend/src/tests/comprehend/comprehend.test.ts
new file mode 100644
index 000000000..d0d2b34b3
--- /dev/null
+++ b/JS/edgechains/arakoodev/src/comprehend/src/tests/comprehend/comprehend.test.ts
@@ -0,0 +1,75 @@
+import { Comprehend } from "../../../../../dist/comprehend/src/lib/comprehend/comprehend.js";
+import { ComprehendClient } from "@aws-sdk/client-comprehend";
+
+jest.mock("@aws-sdk/client-comprehend", () => ({
+    ComprehendClient: jest.fn().mockImplementation(() => ({ send: jest.fn() })),
+    DetectPiiEntitiesCommand: jest.fn().mockImplementation((input) => ({ input })),
+}));
+
+describe("Comprehend", () => {
+    describe("applyRedaction (offset handling)", () => {
+        it("replaces a single entity with the default [TYPE] mask", () => {
+            const text = "Email me at john@doe.com please";
+            const entities = [{ Type: "EMAIL", BeginOffset: 12, EndOffset: 24, Score: 0.99 }];
+            expect(Comprehend.applyRedaction(text, entities)).toBe("Email me at [EMAIL] please");
+        });
+
+        it("replaces multiple entities without shifting later 
offsets", () => { + const text = "John lives at 5th Ave and his ssn is 111-22-3333"; + const entities = [ + { Type: "NAME", BeginOffset: 0, EndOffset: 4 }, + { Type: "ADDRESS", BeginOffset: 14, EndOffset: 21 }, + { Type: "SSN", BeginOffset: 37, EndOffset: 48 }, + ]; + expect(Comprehend.applyRedaction(text, entities)).toBe( + "[NAME] lives at [ADDRESS] and his ssn is [SSN]" + ); + }); + + it("supports a custom mask function", () => { + const text = "call 555-0100"; + const entities = [{ Type: "PHONE", BeginOffset: 5, EndOffset: 13 }]; + const masked = Comprehend.applyRedaction(text, entities, () => "***"); + expect(masked).toBe("call ***"); + }); + + it("returns the text untouched when there are no entities", () => { + expect(Comprehend.applyRedaction("nothing private here", [])).toBe( + "nothing private here" + ); + }); + + it("ignores entities with invalid or out-of-range offsets", () => { + const text = "safe text"; + const entities = [ + { Type: "NAME", BeginOffset: 5, EndOffset: 2 }, // begin >= end + { Type: "NAME", BeginOffset: -1, EndOffset: 3 }, // negative begin + { Type: "NAME", BeginOffset: 0, EndOffset: 999 }, // end past length + ]; + expect(Comprehend.applyRedaction(text, entities)).toBe("safe text"); + }); + }); + + describe("redact (chains detection + redaction)", () => { + it("calls Comprehend and redacts the detected PII", async () => { + const comprehend = new Comprehend({ accessKeyId: "x", secretAccessKey: "y" }); + // Inject the mocked send response. 
+ (comprehend as any).client.send = jest.fn().mockResolvedValueOnce({ + Entities: [{ Type: "EMAIL", BeginOffset: 9, EndOffset: 21, Score: 0.99 }], + }); + + const result = await comprehend.redact("contact me@example.org now"); + expect(result).toBe("contact [EMAIL] now"); + }); + + it("returns empty string for empty input without calling AWS", async () => { + const comprehend = new Comprehend({ accessKeyId: "x", secretAccessKey: "y" }); + const send = jest.fn(); + (comprehend as any).client.send = send; + + const result = await comprehend.redact(""); + expect(result).toBe(""); + expect(send).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/JS/edgechains/examples/redact-pii-with-comprehend/package.json b/JS/edgechains/examples/redact-pii-with-comprehend/package.json new file mode 100644 index 000000000..879d7c847 --- /dev/null +++ b/JS/edgechains/examples/redact-pii-with-comprehend/package.json @@ -0,0 +1,18 @@ +{ + "name": "redact-pii-with-comprehend", + "version": "1.0.0", + "description": "Redact PII with Amazon Comprehend before chaining a prompt into an LLM endpoint.", + "main": "dist/index.js", + "type": "module", + "scripts": { + "start": "tsc && node ./dist/index.js" + }, + "license": "ISC", + "dependencies": { + "@arakoodev/edgechains.js": "file:../../arakoodev" + }, + "devDependencies": { + "@types/node": "^20.17.2", + "typescript": "^5.6.3" + } +} diff --git a/JS/edgechains/examples/redact-pii-with-comprehend/readme.md b/JS/edgechains/examples/redact-pii-with-comprehend/readme.md new file mode 100644 index 000000000..a1a699300 --- /dev/null +++ b/JS/edgechains/examples/redact-pii-with-comprehend/readme.md @@ -0,0 +1,48 @@ +# Redact PII with Amazon Comprehend + +Strip personally identifiable information (PII) out of a user prompt with +[Amazon Comprehend](https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html) +before chaining it into an LLM endpoint such as `OpenAI`. 
+ +```ts +import { Comprehend } from "@arakoodev/edgechains.js/comprehend"; +import { OpenAI } from "@arakoodev/edgechains.js/ai"; + +const comprehend = new Comprehend(); +const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); + +const safePrompt = await comprehend.redact("My email is jane@doe.com"); +// -> "My email is [EMAIL]" + +const answer = await openai.chat({ prompt: safePrompt }); +``` + +## Run + +```bash +npm install +export AWS_ACCESS_KEY_ID=... +export AWS_SECRET_ACCESS_KEY=... +export AWS_REGION=us-east-1 +export OPENAI_API_KEY=... +npm start +``` + +## API + +### `new Comprehend(options?)` + +| option | env fallback | default | +| ----------------- | ----------------------- | ------------- | +| `accessKeyId` | `AWS_ACCESS_KEY_ID` | — | +| `secretAccessKey` | `AWS_SECRET_ACCESS_KEY` | — | +| `region` | `AWS_REGION` | `us-east-1` | + +### `comprehend.redact(text, { languageCode?, mask? })` + +Detects PII and returns a redacted copy of `text`. Each entity is replaced by +`mask(type)` (default `[TYPE]`, e.g. `[EMAIL]`). + +### `comprehend.detectPii(text, languageCode?)` + +Returns the raw `PiiEntity[]` Amazon Comprehend reports. diff --git a/JS/edgechains/examples/redact-pii-with-comprehend/src/index.ts b/JS/edgechains/examples/redact-pii-with-comprehend/src/index.ts new file mode 100644 index 000000000..b120db8df --- /dev/null +++ b/JS/edgechains/examples/redact-pii-with-comprehend/src/index.ts @@ -0,0 +1,32 @@ +import { Comprehend } from "@arakoodev/edgechains.js/comprehend"; +import { OpenAI } from "@arakoodev/edgechains.js/ai"; + +/** + * Redact PII with Amazon Comprehend before chaining the prompt into an LLM. 
+ * + * Env vars required: + * AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (Comprehend) + * OPENAI_API_KEY (OpenAI endpoint) + */ +async function main() { + const comprehend = new Comprehend(); + const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); + + const userPrompt = + "My name is Jane Doe, my email is jane.doe@example.com and my phone is 555-0142. " + + "Please draft a short note asking support to reset my password."; + + // 1. Strip sensitive data out of the prompt. + const safePrompt = await comprehend.redact(userPrompt); + console.log("Original :", userPrompt); + console.log("Redacted :", safePrompt); + + // 2. Chain the redacted prompt straight into the OpenAI endpoint. + const answer = await openai.chat({ prompt: safePrompt }); + console.log("LLM reply:", answer); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/JS/edgechains/examples/redact-pii-with-comprehend/tsconfig.json b/JS/edgechains/examples/redact-pii-with-comprehend/tsconfig.json new file mode 100644 index 000000000..60cc462bf --- /dev/null +++ b/JS/edgechains/examples/redact-pii-with-comprehend/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "esModuleInterop": true, + "strict": true, + "skipLibCheck": true, + "outDir": "./dist", + "rootDir": "./src" + }, + "include": ["src"] +}