Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/connectors/obsidian.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { indexDocument } from "../core/indexing.js";
import { deleteDocument } from "../core/documents.js";
import { createTopic, listTopics } from "../core/topics.js";
import { addTagsToDocument, createTag } from "../core/tags.js";
import { createLink, resolveDocumentByTitle } from "../core/links.js";
import { getLogger } from "../logger.js";
import { ValidationError } from "../errors.js";
import { loadConnectorConfig, saveConnectorConfig } from "./index.js";
Expand Down Expand Up @@ -357,6 +358,20 @@ export async function syncObsidianVault(
}
}

// Store wikilinks as document references
if (parsed.wikilinks.length > 0) {
for (const pageName of parsed.wikilinks) {
try {
const targetId = resolveDocumentByTitle(db, pageName);
if (targetId && targetId !== indexed.id) {
createLink(db, indexed.id, targetId, "references");
}
} catch (err) {
log.debug({ err, pageName, docId: indexed.id }, "Failed to resolve wikilink");
}
}
}

newTrackedFiles[relPath] = { mtime, docId: indexed.id };

if (tracked) {
Expand Down
26 changes: 25 additions & 1 deletion src/core/graph.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@ export interface GraphNode {
/** A directed edge connecting two nodes in the knowledge graph. */
export interface GraphEdge {
  /** Id of the node the edge starts from. */
  source: string;
  /** Id of the node the edge points to. */
  target: string;
  /** Relationship the edge represents (topic/tag membership, similarity, or a document link). */
  type:
    | "belongs_to_topic"
    | "has_tag"
    | "similar_to"
    | "see_also"
    | "prerequisite"
    | "supersedes"
    | "related"
    | "references";
  weight: number; // 0-1
}

Expand Down Expand Up @@ -196,6 +204,22 @@ export function buildKnowledgeGraph(
}
}

// Add document link edges
const docLinks = db
.prepare("SELECT source_id, target_id, link_type FROM document_links")
.all() as Array<{ source_id: string; target_id: string; link_type: string }>;

for (const link of docLinks) {
if (docIds.has(link.source_id) && docIds.has(link.target_id)) {
edges.push({
source: link.source_id,
target: link.target_id,
type: link.link_type as GraphEdge["type"],
weight: 1,
});
}
}

// Compute similarity edges from chunk embeddings
if (includeSimilarity && documents.length > 1) {
try {
Expand Down
9 changes: 9 additions & 0 deletions src/core/indexing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { ValidationError } from "../errors.js";
import { getLogger } from "../logger.js";
import { checkDuplicate } from "./dedup.js";
import type { DedupOptions } from "./dedup.js";
import { extractAndStoreDocumentLinks } from "./links.js";
import { getParserForFile, getSupportedExtensions } from "./parsers/index.js";

export interface IndexDocumentInput {
Expand Down Expand Up @@ -470,6 +471,14 @@ export async function indexDocument(
transaction();

log.info({ docId, chunkCount: chunks.length }, "Document indexed successfully");

// Extract and store document links (best-effort — don't fail indexing)
try {
extractAndStoreDocumentLinks(db, docId, input.content);
} catch (err) {
log.warn({ err, docId }, "Failed to extract document links");
}

return { id: docId, chunkCount: chunks.length };
}

Expand Down
40 changes: 40 additions & 0 deletions src/core/link-extractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,46 @@
* Resolve a potentially-relative href against a base URL.
* Returns null if the result is not an http/https URL (e.g. mailto:, javascript:, data:, #fragment-only).
*/
/**
* Extract markdown-style links from content.
* Parses [text](url) patterns and returns an array of {text, url} objects.
*/
export function extractMarkdownLinks(content: string): Array<{ text: string; url: string }> {
if (!content) return [];

const results: Array<{ text: string; url: string }> = [];
const re = /\[([^\]]*)\]\(([^)]+)\)/g;
let match: RegExpExecArray | null;

while ((match = re.exec(content)) !== null) {

Check failure

Code scanning / CodeQL

Polynomial regular expression used on uncontrolled data High

This
regular expression
that depends on
library input
may run slow on strings starting with '[' and with many repetitions of '[\'.
This
regular expression
that depends on
library input
may run slow on strings starting with '[](' and with many repetitions of '[](('.
This
regular expression
that depends on
library input
may run slow on strings starting with '[' and with many repetitions of '[\'.
This
regular expression
that depends on
library input
may run slow on strings starting with '[](' and with many repetitions of '[](('.
results.push({ text: match[1]!, url: match[2]! });
}

return results;
}

/**
* Extract wikilinks from content.
* Parses [[PageName]] and [[PageName|alias]] formats.
* Returns deduplicated array of page names.
*/
export function extractWikilinks(content: string): string[] {
if (!content) return [];

const seen = new Set<string>();
const re = /\[\[([^\]|]+)(?:\|[^\]]+)?\]\]/g;
let match: RegExpExecArray | null;

while ((match = re.exec(content)) !== null) {

Check failure

Code scanning / CodeQL

Polynomial regular expression used on uncontrolled data High

This
regular expression
that depends on
library input
may run slow on strings starting with '[[' and with many repetitions of '[[\'.
This
regular expression
that depends on
library input
may run slow on strings starting with '[[\|' and with many repetitions of '[[\|'.
This
regular expression
that depends on
library input
may run slow on strings starting with '[[' and with many repetitions of '[[\'.
This
regular expression
that depends on
library input
may run slow on strings starting with '[[\|' and with many repetitions of '[[\|'.
const pageName = match[1]!.trim();
if (pageName) {
seen.add(pageName);
}
}

return [...seen];
}

function resolveUrl(href: string, baseUrl: string): string | null {
// Skip fragment-only links immediately — they point to the same page
if (href.startsWith("#")) return null;
Expand Down
57 changes: 56 additions & 1 deletion src/core/links.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@ import type Database from "better-sqlite3";
import { randomUUID } from "node:crypto";
import { ValidationError, DocumentNotFoundError } from "../errors.js";
import { createChildLogger } from "../logger.js";
import { extractMarkdownLinks, extractWikilinks } from "./link-extractor.js";

/** The set of typed relationships that may connect two documents. */
export type LinkType = "see_also" | "prerequisite" | "supersedes" | "related" | "references";

// Runtime mirror of LinkType, used to validate untrusted link-type strings.
const VALID_LINK_TYPES: ReadonlySet<string> = new Set<LinkType>([
  "see_also",
  "prerequisite",
  "supersedes",
  "related",
  "references",
]);

export interface DocumentLink {
Expand Down Expand Up @@ -224,3 +226,56 @@ export function listLinks(db: Database.Database, linkType?: LinkType): DocumentL
const rows = db.prepare(sql).all(...params) as LinkRowWithTitles[];
return rows.map(rowToLinkWithTitle);
}

/** Resolve a document by its URL. Returns the document id or null. */
export function resolveDocumentByUrl(db: Database.Database, url: string): string | null {
  const stmt = db.prepare("SELECT id FROM documents WHERE url = ? LIMIT 1");
  const found = stmt.get(url) as { id: string } | undefined;
  return found ? found.id : null;
}

/** Resolve a document by its title (case-insensitive). Returns the document id or null. */
export function resolveDocumentByTitle(db: Database.Database, title: string): string | null {
  const stmt = db.prepare("SELECT id FROM documents WHERE lower(title) = lower(?) LIMIT 1");
  const found = stmt.get(title) as { id: string } | undefined;
  return found ? found.id : null;
}

/** Resolve a document reference by trying URL first, then title. Returns the document id or null. */
export function resolveDocumentLink(db: Database.Database, ref: string): string | null {
  const byUrl = resolveDocumentByUrl(db, ref);
  if (byUrl !== null) return byUrl;
  return resolveDocumentByTitle(db, ref);
}

/** Extract markdown and wiki links from content, resolve them, and store as "references" links. */
export function extractAndStoreDocumentLinks(
  db: Database.Database,
  documentId: string,
  content: string,
): void {
  const log = createChildLogger({ operation: "extractAndStoreDocumentLinks" });

  // Gather candidate references: markdown link urls first, then wikilink page names.
  const candidates: string[] = [
    ...extractMarkdownLinks(content).map((link) => link.url),
    ...extractWikilinks(content),
  ];

  // Resolve each candidate to a document id, dropping self-links and duplicates.
  const targets = new Set<string>();
  for (const candidate of candidates) {
    const targetId = resolveDocumentLink(db, candidate);
    if (targetId !== null && targetId !== documentId) {
      targets.add(targetId);
    }
  }

  // Persist each resolved reference; a single failure should not abort the rest.
  for (const targetId of targets) {
    try {
      createLink(db, documentId, targetId, "references");
    } catch (e) {
      log.warn({ documentId, targetId, error: e }, "Failed to create reference link, skipping");
    }
  }
}
109 changes: 109 additions & 0 deletions tests/unit/document-links-resolution.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import type Database from "better-sqlite3";
import { createTestDb } from "../fixtures/test-db.js";
import {
resolveDocumentByUrl,
resolveDocumentByTitle,
resolveDocumentLink,
extractAndStoreDocumentLinks,
getDocumentLinks,
} from "../../src/core/links.js";

/** Seed a minimal document row (empty content, 'manual' source) with an optional URL. */
function insertDocWithUrl(
  db: Database.Database,
  id: string,
  title: string,
  url: string | null,
): void {
  const stmt = db.prepare(
    `INSERT INTO documents (id, title, content, source_type, url) VALUES (?, ?, '', 'manual', ?)`,
  );
  stmt.run(id, title, url);
}

// Exercises URL/title-based document resolution and reference-link extraction
// against a fixture database seeded with three documents (doc-a/doc-b have
// URLs; doc-c is resolvable by title only).
describe("document link resolution", () => {
  let db: Database.Database;

  beforeEach(() => {
    // Fresh database per test so link rows never leak between cases.
    db = createTestDb();
    insertDocWithUrl(db, "doc-a", "Document A", "https://example.com/a");
    insertDocWithUrl(db, "doc-b", "Document B", "https://example.com/b");
    insertDocWithUrl(db, "doc-c", "Document C", null);
  });

  afterEach(() => {
    db.close();
  });

  describe("resolveDocumentByUrl", () => {
    it("should return document id for existing URL", () => {
      expect(resolveDocumentByUrl(db, "https://example.com/a")).toBe("doc-a");
    });

    it("should return null for nonexistent URL", () => {
      expect(resolveDocumentByUrl(db, "https://example.com/missing")).toBeNull();
    });
  });

  describe("resolveDocumentByTitle", () => {
    it("should match case-insensitively", () => {
      expect(resolveDocumentByTitle(db, "document a")).toBe("doc-a");
      expect(resolveDocumentByTitle(db, "DOCUMENT B")).toBe("doc-b");
    });

    it("should return null when no match", () => {
      expect(resolveDocumentByTitle(db, "Nonexistent Doc")).toBeNull();
    });
  });

  describe("resolveDocumentLink", () => {
    it("should resolve by URL first", () => {
      expect(resolveDocumentLink(db, "https://example.com/a")).toBe("doc-a");
    });

    it("should fall back to title when URL does not match", () => {
      // doc-c has no URL, so only the title lookup can find it.
      expect(resolveDocumentLink(db, "Document C")).toBe("doc-c");
    });

    it("should return null if neither URL nor title matches", () => {
      expect(resolveDocumentLink(db, "nothing matches")).toBeNull();
    });
  });

  describe("extractAndStoreDocumentLinks", () => {
    it("should create references links for resolvable markdown refs", () => {
      const content = "See [Doc B](https://example.com/b) for more.";
      extractAndStoreDocumentLinks(db, "doc-a", content);

      const { outgoing } = getDocumentLinks(db, "doc-a");
      expect(outgoing).toHaveLength(1);
      expect(outgoing[0]!.targetId).toBe("doc-b");
      expect(outgoing[0]!.linkType).toBe("references");
    });

    it("should create references links for resolvable wikilinks", () => {
      // Wikilinks resolve by title, not URL.
      const content = "See [[Document B]] for more.";
      extractAndStoreDocumentLinks(db, "doc-a", content);

      const { outgoing } = getDocumentLinks(db, "doc-a");
      expect(outgoing).toHaveLength(1);
      expect(outgoing[0]!.targetId).toBe("doc-b");
      expect(outgoing[0]!.linkType).toBe("references");
    });

    it("should skip unresolvable references", () => {
      const content = "See [missing](https://example.com/missing) and [[Unknown Page]].";
      extractAndStoreDocumentLinks(db, "doc-a", content);

      const { outgoing } = getDocumentLinks(db, "doc-a");
      expect(outgoing).toHaveLength(0);
    });

    it("should skip self-links", () => {
      // The markdown url resolves back to doc-a itself, so no link is stored.
      const content = "See [self](https://example.com/a).";
      extractAndStoreDocumentLinks(db, "doc-a", content);

      const { outgoing } = getDocumentLinks(db, "doc-a");
      expect(outgoing).toHaveLength(0);
    });
  });
});
33 changes: 33 additions & 0 deletions tests/unit/graph-document-links.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { describe, it, expect, beforeEach } from "vitest";
import type Database from "better-sqlite3";
import { createTestDb } from "../fixtures/test-db.js";
import { buildKnowledgeGraph } from "../../src/core/graph.js";
import { createLink } from "../../src/core/links.js";

/** Seed a bare document row ('manual' source, fixed content) for graph tests. */
function insertDocument(db: Database.Database, id: string, title: string): void {
  const sql = `INSERT INTO documents (id, source_type, title, content) VALUES (?, 'manual', ?, 'content')`;
  db.prepare(sql).run(id, title);
}

// Verifies that rows from the document_links table surface as typed edges
// in the graph produced by buildKnowledgeGraph.
describe("buildKnowledgeGraph with document_links", () => {
  let db: Database.Database;

  beforeEach(() => {
    db = createTestDb();
  });

  it("should include document_links edges with type references", async () => {
    insertDocument(db, "d1", "Doc One");
    insertDocument(db, "d2", "Doc Two");
    createLink(db, "d1", "d2", "references");

    // Similarity edges are disabled so the only expected edge is the link.
    const graph = await buildKnowledgeGraph(db, { includeSimilarityEdges: false });

    const refEdges = graph.edges.filter((e) => e.type === "references");
    expect(refEdges).toHaveLength(1);
    expect(refEdges[0]!.source).toBe("d1");
    expect(refEdges[0]!.target).toBe("d2");
    expect(refEdges[0]!.weight).toBe(1);
  });
});
Loading
Loading