Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .changeset/fix-duplicate-sections.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

Fix duplicate sections appearing when scanning repos with identical content across multiple files

Sections with the same title and content from different source files (e.g., shared README sections across package directories) are now deduplicated, keeping only the first occurrence.
Sections with the same content from different source files (e.g., shared README sections across package directories) are now deduplicated based on content only, keeping the first occurrence regardless of section title.
47 changes: 40 additions & 7 deletions packages/context/src/package-builder.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -149,25 +149,23 @@ Run the install command.
expect(result.path).toBe(testDbPath);
});

it("deduplicates sections with identical title and content from different files", () => {
it("deduplicates sections with identical content from different files", () => {
// Simulate the vercel/ai repo scenario where multiple README.md files
// have the same "Skill for Coding Agents" section
const sharedSection = `## Skill for Coding Agents

If you use coding agents such as Claude Code or Cursor, we highly recommend adding the AI SDK skill to your repository.`;
const sharedContent = `If you use coding agents such as Claude Code or Cursor, we highly recommend adding the AI SDK skill to your repository.`;

const files = [
{
path: "packages/deepseek/README.md",
content: `# DeepSeek Provider\n\n## Overview\n\nDeepSeek provider for the AI SDK.\n\n${sharedSection}`,
content: `# DeepSeek Provider\n\n## Overview\n\nDeepSeek provider for the AI SDK.\n\n## Skill for Coding Agents\n\n${sharedContent}`,
},
{
path: "packages/elevenlabs/README.md",
content: `# ElevenLabs Provider\n\n## Overview\n\nElevenLabs provider for the AI SDK.\n\n${sharedSection}`,
content: `# ElevenLabs Provider\n\n## Overview\n\nElevenLabs provider for the AI SDK.\n\n## Skill for Coding Agents\n\n${sharedContent}`,
},
{
path: "packages/fal/README.md",
content: `# Fal Provider\n\n## Overview\n\nFal provider for the AI SDK.\n\n${sharedSection}`,
content: `# Fal Provider\n\n## Overview\n\nFal provider for the AI SDK.\n\n## Skill for Coding Agents\n\n${sharedContent}`,
},
];

Expand Down Expand Up @@ -203,6 +201,41 @@ If you use coding agents such as Claude Code or Cursor, we highly recommend addi
expect(result.sectionCount).toBe(4);
});

it("deduplicates sections with same content but different titles", () => {
  // Two READMEs whose section bodies match exactly while their headings differ.
  const sharedContent = `This is the shared installation instructions for all packages.`;
  const firstReadme = {
    path: "packages/a/README.md",
    content: `# Package A\n\n## Getting Started\n\n${sharedContent}`,
  };
  const secondReadme = {
    path: "packages/b/README.md",
    content: `# Package B\n\n## Installation\n\n${sharedContent}`,
  };
  const buildOptions = { name: "test-content-dedup", version: "1.0.0" };

  buildPackage(testDbPath, [firstReadme, secondReadme], buildOptions);

  const readonlyDb = new Database(testDbPath, { readonly: true });
  try {
    // Look up every stored chunk whose content equals the shared body text.
    const lookup = readonlyDb.prepare(
      "SELECT doc_path, section_title FROM chunks WHERE content = ?",
    );
    const matches = lookup.all(sharedContent) as {
      doc_path: string;
      section_title: string;
    }[];

    // Identical content must be stored exactly once, despite the differing titles.
    expect(matches.length).toBe(1);

    // The surviving row is the one from the file that was processed first.
    const [kept] = matches;
    expect(kept.doc_path).toBe("packages/a/README.md");
    expect(kept.section_title).toBe("Getting Started");
  } finally {
    // Always release the database handle, even when an expectation fails.
    readonlyDb.close();
  }
});

it("keeps sections with same title but different content", () => {
const files = [
{
Expand Down
10 changes: 4 additions & 6 deletions packages/context/src/package-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@ import { type DocSection, parseMarkdown } from "./build.js";
* Generate a content hash for section deduplication.
* Uses first 16 chars of MD5 (sufficient for detecting identical content).
*/
function sectionHash(section: DocSection): string {
// Hash by section title + content to identify duplicates across different files
const key = `${section.sectionTitle}\n${section.content}`;
return createHash("md5").update(key).digest("hex").slice(0, 16);
function contentHash(content: string): string {
  // Feed the raw section text into an MD5 hasher.
  const hasher = createHash("md5");
  hasher.update(content);

  // Keep only the leading 16 hex characters of the digest as the dedup key.
  const hexDigest = hasher.digest("hex");
  return hexDigest.slice(0, 16);
}

export interface PackageBuildOptions {
Expand Down Expand Up @@ -96,8 +94,8 @@ export function buildPackage(
try {
const parsed = parseMarkdown(file.content, file.path);
for (const section of parsed.sections) {
// Deduplicate sections with identical title + content
const hash = sectionHash(section);
// Deduplicate sections with identical content (ignore titles)
const hash = contentHash(section.content);
if (!seenHashes.has(hash)) {
seenHashes.add(hash);
allSections.push(section);
Expand Down