Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions JS/edgechains/arakoodev/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
],
"exports": {
"./ai": "./dist/ai/src/index.js",
"./comprehend": "./dist/comprehend/src/index.js",
"./vector-db": "./dist/vector-db/src/index.js",
"./document-loader": "./dist/document-loader/src/index.js",
"./splitter": "./dist/splitter/src/index.js",
Expand All @@ -22,6 +23,7 @@
"test": "vitest"
},
"dependencies": {
"@aws-sdk/client-comprehend": "^3.700.0",
"@babel/core": "^7.24.4",
"@babel/preset-env": "^7.24.4",
"@hono/node-server": "^0.6.0",
Expand Down
1 change: 1 addition & 0 deletions JS/edgechains/arakoodev/src/comprehend/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export { Comprehend } from "./lib/comprehend/comprehend.js";
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import {
ComprehendClient,
DetectPiiEntitiesCommand,
PiiEntity,
LanguageCode,
} from "@aws-sdk/client-comprehend";

/**
* Construction options for `Comprehend`. Every field is optional; the
* constructor falls back to environment variables for omitted fields.
*/
interface ComprehendConstructionOptions {
/** AWS access key id. Falls back to `process.env.AWS_ACCESS_KEY_ID`. */
accessKeyId?: string;
/** AWS secret access key. Falls back to `process.env.AWS_SECRET_ACCESS_KEY`. */
secretAccessKey?: string;
/** AWS region. Falls back to `process.env.AWS_REGION`, then "us-east-1". */
region?: string;
}

/** Per-call options accepted by `Comprehend.redact`. */
interface RedactOptions {
/** Language of the text, forwarded to PII detection. Defaults to "en". */
languageCode?: LanguageCode;
/**
* Builds the replacement string for a detected entity. Receives the PII
* type (e.g. "EMAIL", "NAME") and returns the mask. Defaults to `[TYPE]`.
* An entity that carries no type is passed to the callback as "PII".
*/
mask?: (type: string) => string;
}

/**
 * Comprehend wraps Amazon Comprehend's PII detection so sensitive data can be
 * stripped from a prompt before it is chained into an Endpoint class
 * (OpenAI, GeminiAI, LlamaAI, ...).
 *
 *   const comprehend = new Comprehend();
 *   const safePrompt = await comprehend.redact(userInput);
 *   const answer = await openai.chat({ prompt: safePrompt });
 */
export class Comprehend {
    private readonly client: ComprehendClient;

    /**
     * Creates the underlying `ComprehendClient`.
     *
     * Region resolution: `options.region`, then `AWS_REGION`, then "us-east-1".
     *
     * Credential resolution: static credentials are only pinned on the client
     * when the caller passed at least one key in `options` (a missing half is
     * still filled from the environment). When nothing is passed explicitly,
     * credentials are left undefined so the SDK's default provider chain
     * resolves them itself.
     *
     * @param options Optional region and access key pair.
     */
    constructor(options: ComprehendConstructionOptions = {}) {
        const region = options.region || process.env.AWS_REGION || "us-east-1";
        const accessKeyId = options.accessKeyId || process.env.AWS_ACCESS_KEY_ID;
        const secretAccessKey = options.secretAccessKey || process.env.AWS_SECRET_ACCESS_KEY;

        this.checkKeys(accessKeyId, secretAccessKey);

        // Do not copy env-only keys into a static credentials object: that
        // would drop AWS_SESSION_TOKEN and break temporary credentials. The
        // default provider chain reads the full set from the environment.
        const callerSuppliedKey = Boolean(options.accessKeyId || options.secretAccessKey);
        const credentials =
            callerSuppliedKey && accessKeyId && secretAccessKey
                ? { accessKeyId, secretAccessKey }
                : undefined;

        this.client = new ComprehendClient({ region, credentials });
    }

    /**
     * Logs a warning when no complete access key pair could be resolved from
     * the constructor options or the environment. Never throws.
     */
    private checkKeys(accessKeyId?: string, secretAccessKey?: string): void {
        if (!accessKeyId || !secretAccessKey) {
            console.warn(
                "AWS credentials are missing. Provide them in the constructor or as " +
                    "AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY. The default AWS provider " +
                    "chain will be used instead."
            );
        }
    }

    /**
     * Returns the raw PII entities Amazon Comprehend detects in `text`.
     *
     * @param text Text to analyse. Empty input short-circuits to `[]` without
     *   calling AWS.
     * @param languageCode Language passed to the service. Defaults to "en".
     * @returns The `Entities` array of the response, or `[]` when absent.
     */
    async detectPii(text: string, languageCode: LanguageCode = "en"): Promise<PiiEntity[]> {
        if (!text) return [];
        const response = await this.client.send(
            new DetectPiiEntitiesCommand({ Text: text, LanguageCode: languageCode })
        );
        return response.Entities ?? [];
    }

    /**
     * Detects PII in `text` and replaces every entity with a mask, returning a
     * redacted copy of the string. Safe to chain straight into an Endpoint.
     *
     * @param text Text to redact.
     * @param options Optional language code and mask builder.
     * @returns The redacted text.
     */
    async redact(text: string, options: RedactOptions = {}): Promise<string> {
        const entities = await this.detectPii(text, options.languageCode ?? "en");
        return Comprehend.applyRedaction(text, entities, options.mask);
    }

    /**
     * Pure replacement step. Exposed (static) so the offset handling can be
     * unit-tested without hitting AWS.
     *
     * Entities with missing, non-integer, negative, reversed or out-of-range
     * offsets are ignored. The remaining spans are processed left to right
     * against the ORIGINAL text; spans that overlap, nest or duplicate an
     * earlier span are folded into that span's redacted region, so one mask
     * (built from the earliest-starting entity's type) covers the union and no
     * covered character survives.
     *
     * NOTE(review): offsets are used directly as JavaScript string indices;
     * confirm the service's offset unit matches for text containing
     * characters outside the BMP (e.g. emoji).
     *
     * @param text Original text.
     * @param entities Entities carrying `BeginOffset` / `EndOffset` / `Type`.
     * @param mask Builds the replacement for a type. Defaults to `[TYPE]`;
     *   entities without a type are passed as "PII".
     * @returns The redacted copy of `text`.
     */
    static applyRedaction(
        text: string,
        entities: PiiEntity[],
        mask: (type: string) => string = (type) => `[${type}]`
    ): string {
        if (!text || entities.length === 0) return text;

        const spans: { begin: number; end: number; type: string }[] = [];
        for (const { BeginOffset: begin, EndOffset: end, Type: type } of entities) {
            if (begin == null || end == null) continue;
            if (!Number.isInteger(begin) || !Number.isInteger(end)) continue;
            if (begin < 0 || end > text.length || begin >= end) continue;
            spans.push({ begin, end, type: type ?? "PII" });
        }

        // Earliest start first; on ties the longest span wins the mask.
        spans.sort((a, b) => a.begin - b.begin || b.end - a.end);

        let redacted = "";
        let cursor = 0; // first index of `text` not yet emitted or masked
        for (const span of spans) {
            if (span.begin < cursor) {
                // Overlaps the region already masked: extend it, emit nothing.
                cursor = Math.max(cursor, span.end);
                continue;
            }
            redacted += text.slice(cursor, span.begin) + mask(span.type);
            cursor = span.end;
        }
        return redacted + text.slice(cursor);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { Comprehend } from "../../../../../dist/comprehend/src/lib/comprehend/comprehend.js";
import { ComprehendClient } from "@aws-sdk/client-comprehend";

// Replace the AWS SDK with lightweight stand-ins so no test touches the network.
jest.mock("@aws-sdk/client-comprehend", () => ({
    ComprehendClient: jest.fn().mockImplementation(() => ({ send: jest.fn() })),
    DetectPiiEntitiesCommand: jest.fn().mockImplementation((input) => ({ input })),
}));

describe("Comprehend", () => {
    // Pure string handling: exercised through the static helper, no client needed.
    describe("applyRedaction (offset handling)", () => {
        it("replaces a single entity with the default [TYPE] mask", () => {
            const text = "Email me at john@doe.com please";
            const entities = [{ Type: "EMAIL", BeginOffset: 12, EndOffset: 24, Score: 0.99 }];
            expect(Comprehend.applyRedaction(text, entities)).toBe("Email me at [EMAIL] please");
        });

        it("replaces multiple entities without shifting later offsets", () => {
            const text = "John lives at 5th Ave and his ssn is 111-22-3333";
            const entities = [
                { Type: "NAME", BeginOffset: 0, EndOffset: 4 },
                { Type: "ADDRESS", BeginOffset: 14, EndOffset: 21 },
                { Type: "SSN", BeginOffset: 37, EndOffset: 48 },
            ];
            expect(Comprehend.applyRedaction(text, entities)).toBe(
                "[NAME] lives at [ADDRESS] and his ssn is [SSN]"
            );
        });

        it("supports a custom mask function", () => {
            const text = "call 555-0100";
            const entities = [{ Type: "PHONE", BeginOffset: 5, EndOffset: 13 }];
            const masked = Comprehend.applyRedaction(text, entities, () => "***");
            expect(masked).toBe("call ***");
        });

        it("returns the text untouched when there are no entities", () => {
            expect(Comprehend.applyRedaction("nothing private here", [])).toBe(
                "nothing private here"
            );
        });

        it("ignores entities with invalid or out-of-range offsets", () => {
            const text = "safe text";
            const entities = [
                { Type: "NAME", BeginOffset: 5, EndOffset: 2 }, // begin >= end
                { Type: "NAME", BeginOffset: -1, EndOffset: 3 }, // negative begin
                { Type: "NAME", BeginOffset: 0, EndOffset: 999 }, // end past length
            ];
            expect(Comprehend.applyRedaction(text, entities)).toBe("safe text");
        });
    });

    // End-to-end through redact(), with the client's send() stubbed per test.
    describe("redact (chains detection + redaction)", () => {
        it("calls Comprehend and redacts the detected PII", async () => {
            const comprehend = new Comprehend({ accessKeyId: "x", secretAccessKey: "y" });
            // Inject the mocked send response. "me@example.org" occupies
            // indices 8..21 of the input, so the exclusive end offset is 22.
            (comprehend as any).client.send = jest.fn().mockResolvedValueOnce({
                Entities: [{ Type: "EMAIL", BeginOffset: 8, EndOffset: 22, Score: 0.99 }],
            });

            const result = await comprehend.redact("contact me@example.org now");
            expect(result).toBe("contact [EMAIL] now");
        });

        it("returns empty string for empty input without calling AWS", async () => {
            const comprehend = new Comprehend({ accessKeyId: "x", secretAccessKey: "y" });
            const send = jest.fn();
            (comprehend as any).client.send = send;

            const result = await comprehend.redact("");
            expect(result).toBe("");
            expect(send).not.toHaveBeenCalled();
        });
    });
});
18 changes: 18 additions & 0 deletions JS/edgechains/examples/redact-pii-with-comprehend/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "redact-pii-with-comprehend",
"version": "1.0.0",
"description": "Redact PII with Amazon Comprehend before chaining a prompt into an LLM endpoint.",
"main": "dist/index.js",
"type": "module",
"scripts": {
"start": "tsc && node ./dist/index.js"
},
"license": "ISC",
"dependencies": {
"@arakoodev/edgechains.js": "file:../../arakoodev"
},
"devDependencies": {
"@types/node": "^20.17.2",
"typescript": "^5.6.3"
}
}
48 changes: 48 additions & 0 deletions JS/edgechains/examples/redact-pii-with-comprehend/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Redact PII with Amazon Comprehend

Strip personally identifiable information (PII) out of a user prompt with
[Amazon Comprehend](https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html)
before chaining it into an LLM endpoint such as `OpenAI`.

```ts
import { Comprehend } from "@arakoodev/edgechains.js/comprehend";
import { OpenAI } from "@arakoodev/edgechains.js/ai";

const comprehend = new Comprehend();
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

const safePrompt = await comprehend.redact("My email is jane@doe.com");
// -> "My email is [EMAIL]"

const answer = await openai.chat({ prompt: safePrompt });
```

## Run

```bash
npm install
export AWS_ACCESS_KEY_ID=...
export AWS_SECRET_ACCESS_KEY=...
export AWS_REGION=us-east-1
export OPENAI_API_KEY=...
npm start
```

## API

### `new Comprehend(options?)`

| option | env fallback | default |
| ----------------- | ----------------------- | ------------- |
| `accessKeyId` | `AWS_ACCESS_KEY_ID` | — |
| `secretAccessKey` | `AWS_SECRET_ACCESS_KEY` | — |
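| `region` | `AWS_REGION` | `us-east-1` |

When no access key pair is available from either source, the client is created
without explicit credentials and the AWS SDK's default credential provider
chain is used (a warning is logged).
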
### `comprehend.redact(text, { languageCode?, mask? })`

Detects PII and returns a redacted copy of `text`. Each entity is replaced by
`mask(type)` (default `[TYPE]`, e.g. `[EMAIL]`).

### `comprehend.detectPii(text, languageCode?)`

Returns the raw `PiiEntity[]` Amazon Comprehend reports.
32 changes: 32 additions & 0 deletions JS/edgechains/examples/redact-pii-with-comprehend/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { Comprehend } from "@arakoodev/edgechains.js/comprehend";
import { OpenAI } from "@arakoodev/edgechains.js/ai";

/**
* Redact PII with Amazon Comprehend before chaining the prompt into an LLM.
*
* Env vars required:
* AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (Comprehend)
* OPENAI_API_KEY (OpenAI endpoint)
*/
async function main() {
    // Clients: Comprehend for PII detection, OpenAI as the downstream endpoint.
    const piiFilter = new Comprehend();
    const llm = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

    const rawPrompt =
        "My name is Jane Doe, my email is jane.doe@example.com and my phone is 555-0142. " +
        "Please draft a short note asking support to reset my password.";

    // Step 1: mask every detected PII entity before the text leaves this process.
    const maskedPrompt = await piiFilter.redact(rawPrompt);
    console.log("Original :", rawPrompt);
    console.log("Redacted :", maskedPrompt);

    // Step 2: hand only the masked prompt to the LLM and print its reply.
    const reply = await llm.chat({ prompt: maskedPrompt });
    console.log("LLM reply:", reply);
}

// Run the example; any failure is printed and reported through exit code 1.
main().catch((error) => {
    console.error(error);
    process.exit(1);
});
13 changes: 13 additions & 0 deletions JS/edgechains/examples/redact-pii-with-comprehend/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "NodeNext",
"moduleResolution": "NodeNext",
"esModuleInterop": true,
"strict": true,
"skipLibCheck": true,
"outDir": "./dist",
"rootDir": "./src"
},
"include": ["src"]
}
Loading