diff --git a/src/connectors/obsidian.ts b/src/connectors/obsidian.ts index 9739c5e..00a8d83 100644 --- a/src/connectors/obsidian.ts +++ b/src/connectors/obsidian.ts @@ -7,6 +7,7 @@ import { indexDocument } from "../core/indexing.js"; import { deleteDocument } from "../core/documents.js"; import { createTopic, listTopics } from "../core/topics.js"; import { addTagsToDocument, createTag } from "../core/tags.js"; +import { createLink, resolveDocumentByTitle } from "../core/links.js"; import { getLogger } from "../logger.js"; import { ValidationError } from "../errors.js"; import { loadConnectorConfig, saveConnectorConfig } from "./index.js"; @@ -357,6 +358,20 @@ export async function syncObsidianVault( } } + // Store wikilinks as document references + if (parsed.wikilinks.length > 0) { + for (const pageName of parsed.wikilinks) { + try { + const targetId = resolveDocumentByTitle(db, pageName); + if (targetId && targetId !== indexed.id) { + createLink(db, indexed.id, targetId, "references"); + } + } catch (err) { + log.debug({ err, pageName, docId: indexed.id }, "Failed to resolve wikilink"); + } + } + } + newTrackedFiles[relPath] = { mtime, docId: indexed.id }; if (tracked) { diff --git a/src/core/graph.ts b/src/core/graph.ts index a385d04..1f4b89c 100644 --- a/src/core/graph.ts +++ b/src/core/graph.ts @@ -11,7 +11,15 @@ export interface GraphNode { export interface GraphEdge { source: string; target: string; - type: "belongs_to_topic" | "has_tag" | "similar_to"; + type: + | "belongs_to_topic" + | "has_tag" + | "similar_to" + | "see_also" + | "prerequisite" + | "supersedes" + | "related" + | "references"; weight: number; // 0-1 } @@ -196,6 +204,22 @@ export function buildKnowledgeGraph( } } + // Add document link edges + const docLinks = db + .prepare("SELECT source_id, target_id, link_type FROM document_links") + .all() as Array<{ source_id: string; target_id: string; link_type: string }>; + + for (const link of docLinks) { + if (docIds.has(link.source_id) && 
docIds.has(link.target_id)) { + edges.push({ + source: link.source_id, + target: link.target_id, + type: link.link_type as GraphEdge["type"], + weight: 1, + }); + } + } + // Compute similarity edges from chunk embeddings if (includeSimilarity && documents.length > 1) { try { diff --git a/src/core/indexing.ts b/src/core/indexing.ts index 2a025d5..0f1a015 100644 --- a/src/core/indexing.ts +++ b/src/core/indexing.ts @@ -8,6 +8,7 @@ import { ValidationError } from "../errors.js"; import { getLogger } from "../logger.js"; import { checkDuplicate } from "./dedup.js"; import type { DedupOptions } from "./dedup.js"; +import { extractAndStoreDocumentLinks } from "./links.js"; import { getParserForFile, getSupportedExtensions } from "./parsers/index.js"; export interface IndexDocumentInput { @@ -470,6 +471,14 @@ export async function indexDocument( transaction(); log.info({ docId, chunkCount: chunks.length }, "Document indexed successfully"); + + // Extract and store document links (best-effort — don't fail indexing) + try { + extractAndStoreDocumentLinks(db, docId, input.content); + } catch (err) { + log.warn({ err, docId }, "Failed to extract document links"); + } + return { id: docId, chunkCount: chunks.length }; } diff --git a/src/core/link-extractor.ts b/src/core/link-extractor.ts index 5c7c889..ed6e59f 100644 --- a/src/core/link-extractor.ts +++ b/src/core/link-extractor.ts @@ -127,6 +127,46 @@ function extractHref(tag: string): string | null { * Resolve a potentially-relative href against a base URL. * Returns null if the result is not an http/https URL (e.g. mailto:, javascript:, data:, #fragment-only). */ +/** + * Extract markdown-style links from content. + * Parses [text](url) patterns and returns an array of {text, url} objects. 
+ */
+export function extractMarkdownLinks(content: string): Array<{ text: string; url: string }> {
+  if (!content) return [];
+
+  const results: Array<{ text: string; url: string }> = [];
+  const re = /\[([^\]]*)\]\(([^)]+)\)/g;
+  let match: RegExpExecArray | null;
+
+  while ((match = re.exec(content)) !== null) {
+    results.push({ text: match[1]!, url: match[2]! });
+  }
+
+  return results;
+}
+
+/**
+ * Extract wikilinks from content.
+ * Parses [[PageName]] and [[PageName|alias]] formats.
+ * Returns deduplicated array of page names.
+ */
+export function extractWikilinks(content: string): string[] {
+  if (!content) return [];
+
+  const seen = new Set<string>();
+  const re = /\[\[([^\]|]+)(?:\|[^\]]+)?\]\]/g;
+  let match: RegExpExecArray | null;
+
+  while ((match = re.exec(content)) !== null) {
+    const pageName = match[1]!.trim();
+    if (pageName) {
+      seen.add(pageName);
+    }
+  }
+
+  return [...seen];
+}
+
 function resolveUrl(href: string, baseUrl: string): string | null {
   // Skip fragment-only links immediately — they point to the same page
   if (href.startsWith("#")) return null;
diff --git a/src/core/links.ts b/src/core/links.ts
index 673c801..e725d71 100644
--- a/src/core/links.ts
+++ b/src/core/links.ts
@@ -2,14 +2,16 @@ import type Database from "better-sqlite3";
 import { randomUUID } from "node:crypto";
 import { ValidationError, DocumentNotFoundError } from "../errors.js";
 import { createChildLogger } from "../logger.js";
+import { extractMarkdownLinks, extractWikilinks } from "./link-extractor.js";
 
-export type LinkType = "see_also" | "prerequisite" | "supersedes" | "related";
+export type LinkType = "see_also" | "prerequisite" | "supersedes" | "related" | "references";
 
 const VALID_LINK_TYPES: ReadonlySet<LinkType> = new Set([
   "see_also",
   "prerequisite",
   "supersedes",
   "related",
+  "references",
 ]);
 
 export interface DocumentLink {
@@ -224,6 +226,22 @@ export function listLinks(db: Database.Database, linkType?: LinkType): DocumentL
   const rows = db.prepare(sql).all(...params) as
LinkRowWithTitles[]; return rows.map(rowToLinkWithTitle); } + +/** Resolve a document by its URL. Returns the document id or null. */ +export function resolveDocumentByUrl(db: Database.Database, url: string): string | null { + const row = db.prepare("SELECT id FROM documents WHERE url = ? LIMIT 1").get(url) as + | { id: string } + | undefined; + return row?.id ?? null; +} + +/** Resolve a document by its title (case-insensitive). Returns the document id or null. */ +export function resolveDocumentByTitle(db: Database.Database, title: string): string | null { + const row = db + .prepare("SELECT id FROM documents WHERE lower(title) = lower(?) LIMIT 1") + .get(title) as { id: string } | undefined; + return row?.id ?? null; +} + +/** Resolve a document reference by trying URL first, then title. Returns the document id or null. */ +export function resolveDocumentLink(db: Database.Database, ref: string): string | null { + return resolveDocumentByUrl(db, ref) ?? resolveDocumentByTitle(db, ref); +} + +/** Extract markdown and wiki links from content, resolve them, and store as "references" links. 
*/
+export function extractAndStoreDocumentLinks(
+  db: Database.Database,
+  documentId: string,
+  content: string,
+): void {
+  const log = createChildLogger({ operation: "extractAndStoreDocumentLinks" });
+  const resolved = new Set<string>();
+
+  for (const { url } of extractMarkdownLinks(content)) {
+    const targetId = resolveDocumentLink(db, url);
+    if (targetId && targetId !== documentId) {
+      resolved.add(targetId);
+    }
+  }
+
+  for (const pageName of extractWikilinks(content)) {
+    const targetId = resolveDocumentLink(db, pageName);
+    if (targetId && targetId !== documentId) {
+      resolved.add(targetId);
+    }
+  }
+
+  for (const targetId of resolved) {
+    try {
+      createLink(db, documentId, targetId, "references");
+    } catch (e) {
+      log.warn({ documentId, targetId, error: e }, "Failed to create reference link, skipping");
+    }
+  }
+}
diff --git a/tests/unit/document-links-resolution.test.ts b/tests/unit/document-links-resolution.test.ts
new file mode 100644
index 0000000..8ff560a
--- /dev/null
+++ b/tests/unit/document-links-resolution.test.ts
@@ -0,0 +1,109 @@
+import { describe, it, expect, beforeEach, afterEach } from "vitest";
+import type Database from "better-sqlite3";
+import { createTestDb } from "../fixtures/test-db.js";
+import {
+  resolveDocumentByUrl,
+  resolveDocumentByTitle,
+  resolveDocumentLink,
+  extractAndStoreDocumentLinks,
+  getDocumentLinks,
+} from "../../src/core/links.js";
+
+function insertDocWithUrl(
+  db: Database.Database,
+  id: string,
+  title: string,
+  url: string | null,
+): void {
+  db.prepare(
+    `INSERT INTO documents (id, title, content, source_type, url) VALUES (?, ?, '', 'manual', ?)`,
+  ).run(id, title, url);
+}
+
+describe("document link resolution", () => {
+  let db: Database.Database;
+
+  beforeEach(() => {
+    db = createTestDb();
+    insertDocWithUrl(db, "doc-a", "Document A", "https://example.com/a");
+    insertDocWithUrl(db, "doc-b", "Document B", "https://example.com/b");
+    insertDocWithUrl(db, "doc-c", "Document C", null);
+  });
+
+  
afterEach(() => { + db.close(); + }); + + describe("resolveDocumentByUrl", () => { + it("should return document id for existing URL", () => { + expect(resolveDocumentByUrl(db, "https://example.com/a")).toBe("doc-a"); + }); + + it("should return null for nonexistent URL", () => { + expect(resolveDocumentByUrl(db, "https://example.com/missing")).toBeNull(); + }); + }); + + describe("resolveDocumentByTitle", () => { + it("should match case-insensitively", () => { + expect(resolveDocumentByTitle(db, "document a")).toBe("doc-a"); + expect(resolveDocumentByTitle(db, "DOCUMENT B")).toBe("doc-b"); + }); + + it("should return null when no match", () => { + expect(resolveDocumentByTitle(db, "Nonexistent Doc")).toBeNull(); + }); + }); + + describe("resolveDocumentLink", () => { + it("should resolve by URL first", () => { + expect(resolveDocumentLink(db, "https://example.com/a")).toBe("doc-a"); + }); + + it("should fall back to title when URL does not match", () => { + expect(resolveDocumentLink(db, "Document C")).toBe("doc-c"); + }); + + it("should return null if neither URL nor title matches", () => { + expect(resolveDocumentLink(db, "nothing matches")).toBeNull(); + }); + }); + + describe("extractAndStoreDocumentLinks", () => { + it("should create references links for resolvable markdown refs", () => { + const content = "See [Doc B](https://example.com/b) for more."; + extractAndStoreDocumentLinks(db, "doc-a", content); + + const { outgoing } = getDocumentLinks(db, "doc-a"); + expect(outgoing).toHaveLength(1); + expect(outgoing[0]!.targetId).toBe("doc-b"); + expect(outgoing[0]!.linkType).toBe("references"); + }); + + it("should create references links for resolvable wikilinks", () => { + const content = "See [[Document B]] for more."; + extractAndStoreDocumentLinks(db, "doc-a", content); + + const { outgoing } = getDocumentLinks(db, "doc-a"); + expect(outgoing).toHaveLength(1); + expect(outgoing[0]!.targetId).toBe("doc-b"); + 
expect(outgoing[0]!.linkType).toBe("references"); + }); + + it("should skip unresolvable references", () => { + const content = "See [missing](https://example.com/missing) and [[Unknown Page]]."; + extractAndStoreDocumentLinks(db, "doc-a", content); + + const { outgoing } = getDocumentLinks(db, "doc-a"); + expect(outgoing).toHaveLength(0); + }); + + it("should skip self-links", () => { + const content = "See [self](https://example.com/a)."; + extractAndStoreDocumentLinks(db, "doc-a", content); + + const { outgoing } = getDocumentLinks(db, "doc-a"); + expect(outgoing).toHaveLength(0); + }); + }); +}); diff --git a/tests/unit/graph-document-links.test.ts b/tests/unit/graph-document-links.test.ts new file mode 100644 index 0000000..f3fb44b --- /dev/null +++ b/tests/unit/graph-document-links.test.ts @@ -0,0 +1,33 @@ +import { describe, it, expect, beforeEach } from "vitest"; +import type Database from "better-sqlite3"; +import { createTestDb } from "../fixtures/test-db.js"; +import { buildKnowledgeGraph } from "../../src/core/graph.js"; +import { createLink } from "../../src/core/links.js"; + +function insertDocument(db: Database.Database, id: string, title: string): void { + db.prepare( + `INSERT INTO documents (id, source_type, title, content) VALUES (?, 'manual', ?, 'content')`, + ).run(id, title); +} + +describe("buildKnowledgeGraph with document_links", () => { + let db: Database.Database; + + beforeEach(() => { + db = createTestDb(); + }); + + it("should include document_links edges with type references", async () => { + insertDocument(db, "d1", "Doc One"); + insertDocument(db, "d2", "Doc Two"); + createLink(db, "d1", "d2", "references"); + + const graph = await buildKnowledgeGraph(db, { includeSimilarityEdges: false }); + + const refEdges = graph.edges.filter((e) => e.type === "references"); + expect(refEdges).toHaveLength(1); + expect(refEdges[0]!.source).toBe("d1"); + expect(refEdges[0]!.target).toBe("d2"); + expect(refEdges[0]!.weight).toBe(1); + }); +}); 
diff --git a/tests/unit/link-extractor.test.ts b/tests/unit/link-extractor.test.ts index f623eba..c6731f7 100644 --- a/tests/unit/link-extractor.test.ts +++ b/tests/unit/link-extractor.test.ts @@ -1,5 +1,9 @@ import { describe, it, expect } from "vitest"; -import { extractLinks } from "../../src/core/link-extractor.js"; +import { + extractLinks, + extractMarkdownLinks, + extractWikilinks, +} from "../../src/core/link-extractor.js"; const BASE = "https://example.com/docs/intro"; @@ -153,3 +157,45 @@ describe("extractLinks", () => { expect(links).toContain("https://example.com/https"); }); }); + +describe("extractMarkdownLinks", () => { + it("extracts a standard markdown link", () => { + const result = extractMarkdownLinks("[click here](https://example.com)"); + expect(result).toEqual([{ text: "click here", url: "https://example.com" }]); + }); + + it("extracts multiple links", () => { + const content = "See [a](https://a.com) and [b](https://b.com)."; + const result = extractMarkdownLinks(content); + expect(result).toHaveLength(2); + expect(result[0]).toEqual({ text: "a", url: "https://a.com" }); + expect(result[1]).toEqual({ text: "b", url: "https://b.com" }); + }); + + it("returns empty array when no links present", () => { + expect(extractMarkdownLinks("no links here")).toEqual([]); + }); + + it("returns empty array for empty string", () => { + expect(extractMarkdownLinks("")).toEqual([]); + }); +}); + +describe("extractWikilinks", () => { + it("extracts a simple wikilink", () => { + expect(extractWikilinks("See [[MyPage]] for details")).toEqual(["MyPage"]); + }); + + it("extracts page name from aliased wikilink, not the alias", () => { + expect(extractWikilinks("See [[MyPage|display text]]")).toEqual(["MyPage"]); + }); + + it("extracts multiple wikilinks and deduplicates", () => { + const result = extractWikilinks("[[A]] then [[B]] then [[A]] again"); + expect(result).toEqual(["A", "B"]); + }); + + it("returns empty array for empty string", () => { + 
expect(extractWikilinks("")).toEqual([]); + }); +});