diff --git a/README.md b/README.md index 47f534a..28342d4 100644 --- a/README.md +++ b/README.md @@ -202,6 +202,7 @@ LocalRecall uses environment variables to configure its behavior. These variable | `LISTENING_ADDRESS` | Address the server listens on (default: `:8080`). Useful for deployments on custom ports or network interfaces. | | `VECTOR_ENGINE` | Vector database engine to use (`chromem` by default, `postgres` for PostgreSQL). | | `MAX_CHUNKING_SIZE` | Maximum size (in characters) for breaking down documents into chunks. Affects performance and accuracy. | +| `CHUNK_OVERLAP` | Overlap in characters between consecutive chunks (word-aligned). Default: 0. Use to improve context across chunk boundaries. | | `HYBRID_SEARCH_BM25_WEIGHT` | Weight for BM25 keyword search in hybrid search (default: 0.5, PostgreSQL only). | | `HYBRID_SEARCH_VECTOR_WEIGHT` | Weight for vector similarity search in hybrid search (default: 0.5, PostgreSQL only). | | `API_KEYS` | Comma-separated list of API keys for securing access to the REST API (optional). | @@ -246,6 +247,14 @@ curl -X GET $BASE_URL/collections curl -X GET $BASE_URL/collections/myCollection/entries ``` +- **Get Entry Content**: + +```sh +curl -X GET $BASE_URL/collections/myCollection/entries/file.txt +``` + +Returns `collection`, `entry`, `content` (the full stored text of the entry, without chunk overlap), and `chunk_count` (the number of chunks the entry occupies). 
+ - **Search Collection**: ```sh diff --git a/main.go b/main.go index 1f38368..1bc4b91 100644 --- a/main.go +++ b/main.go @@ -21,6 +21,7 @@ var ( listeningAddress = os.Getenv("LISTENING_ADDRESS") vectorEngine = os.Getenv("VECTOR_ENGINE") maxChunkingSize = os.Getenv("MAX_CHUNKING_SIZE") + chunkOverlap = os.Getenv("CHUNK_OVERLAP") apiKeys = os.Getenv("API_KEYS") gitPrivateKey = os.Getenv("GIT_PRIVATE_KEY") sourceManager = rag.NewSourceManager(&sources.Config{ @@ -77,7 +78,16 @@ func startAPI(listenAddress string) { } } - registerAPIRoutes(e, openAIClient, chunkingSize, keys) + overlap := 0 + if chunkOverlap != "" { + var err error + overlap, err = strconv.Atoi(chunkOverlap) + if err != nil { + e.Logger.Fatal("Failed to convert CHUNK_OVERLAP to integer") + } + } + + registerAPIRoutes(e, openAIClient, chunkingSize, overlap, keys) e.Logger.Fatal(e.Start(listenAddress)) } diff --git a/pkg/chunk/chunk_test.go b/pkg/chunk/chunk_test.go index e7bb74f..c25e8a5 100644 --- a/pkg/chunk/chunk_test.go +++ b/pkg/chunk/chunk_test.go @@ -1,6 +1,8 @@ package chunk_test import ( + "strings" + . "github.com/mudler/localrecall/pkg/chunk" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -45,5 +47,83 @@ var _ = Describe("Chunk", func() { chunks := SplitParagraphIntoChunks(text, 30) Expect(chunks).ToNot(BeEmpty()) }) + + It("should split words longer than maxChunkSize into chunks each <= maxChunkSize", func() { + text := "normal verylongwordhere end" + chunks := SplitParagraphIntoChunks(text, 5) + Expect(chunks).ToNot(BeEmpty()) + for _, c := range chunks { + Expect(len(c)).To(BeNumerically("<=", 5)) + } + // "verylongwordhere" (16 chars) with max 5 -> 4 chunks of 5,5,5,1 + Expect(chunks).To(ContainElement("veryl")) + Expect(chunks).To(ContainElement("ongwo")) + Expect(chunks).To(ContainElement("rdher")) + Expect(chunks).To(ContainElement("e")) + }) + + It("backward compatibility: SplitParagraphIntoChunks matches Options with Overlap 0", func() { + text := "This is a very long text that should be split into multiple chunks." + chunksLegacy := SplitParagraphIntoChunks(text, 20) + chunksOpts := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 0, SplitLongWords: true}) + Expect(chunksLegacy).To(Equal(chunksOpts)) + }) + }) + + Describe("SplitParagraphIntoChunksWithOptions", func() { + It("should apply overlap between consecutive chunks", func() { + text := "one two three four five six seven eight nine ten" + chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 10}) + Expect(chunks).ToNot(BeEmpty()) + for _, c := range chunks { + Expect(len(c)).To(BeNumerically("<=", 20)) + } + // Consecutive chunks should share a suffix/prefix + for i := 0; i < len(chunks)-1; i++ { + tail := chunks[i] + head := chunks[i+1] + // Some overlap: tail of chunk i should appear at start of chunk i+1 + found := false + for n := 1; n <= len(tail) && n <= len(head); n++ { + if tail[len(tail)-n:] == head[:n] { + found = true + break + } + } + // Or head starts with last words of tail (word-aligned) + wordsTail := strings.Fields(tail) + if len(wordsTail) > 0 { + lastWord := wordsTail[len(wordsTail)-1] 
+ if strings.HasPrefix(head, lastWord) || head == lastWord { + found = true + } + } + Expect(found).To(BeTrue(), "chunk %d and %d should share overlap", i, i+1) + } + }) + + It("Overlap 0 matches no overlap", func() { + text := "a b c d e f g h i j k l m n o p" + chunksOverlap := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 0}) + chunksLegacy := SplitParagraphIntoChunks(text, 10) + Expect(chunksOverlap).To(Equal(chunksLegacy)) + }) + + It("SplitLongWords false allows a single word to exceed MaxSize as one chunk", func() { + text := "short verylongwordhere end" + chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 5, Overlap: 0, SplitLongWords: false}) + Expect(chunks).To(ContainElement("verylongwordhere")) + Expect(chunks).To(ContainElement("short")) + Expect(chunks).To(ContainElement("end")) + }) + + It("clamps Overlap >= MaxSize to MaxSize-1", func() { + text := "one two three four five" + chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 99}) + Expect(chunks).ToNot(BeEmpty()) + for _, c := range chunks { + Expect(len(c)).To(BeNumerically("<=", 10)) + } + }) }) }) diff --git a/pkg/chunk/chunking.go b/pkg/chunk/chunking.go index 71fd499..31231a4 100644 --- a/pkg/chunk/chunking.go +++ b/pkg/chunk/chunking.go @@ -4,43 +4,188 @@ import ( "strings" ) -// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input, -// and returns a slice of strings where each string is a chunk of the paragraph -// that is at most maxChunkSize long, ensuring that words are not split. -func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string { - if len(paragraph) <= maxChunkSize { - return []string{paragraph} +// Options configures paragraph chunking. +type Options struct { + // MaxSize is the maximum characters per chunk (required, must be > 0). + MaxSize int + // Overlap is the overlap in characters between consecutive chunks, word-aligned (0 = no overlap). 
+ // Must be < MaxSize; values >= MaxSize are clamped to MaxSize-1. + Overlap int + // SplitLongWords, when true, splits words longer than MaxSize into smaller chunks so no chunk exceeds MaxSize (default true). + SplitLongWords bool +} + +// splitLongString splits s into pieces of at most maxSize characters. +// Returns a slice of substrings; each has length <= maxSize. +func splitLongString(s string, maxSize int) []string { + if maxSize <= 0 || len(s) <= maxSize { + return []string{s} + } + var pieces []string + for len(s) > 0 { + n := maxSize + if n > len(s) { + n = len(s) + } + pieces = append(pieces, s[:n]) + s = s[n:] } + return pieces +} +// overlapTail returns the suffix of chunk that is at most overlap characters and word-aligned (whole words only). +// If overlap is 0 or chunk is empty, returns "". +func overlapTail(chunk string, overlap int) string { + if overlap <= 0 || chunk == "" { + return "" + } + words := strings.Fields(chunk) + if len(words) == 0 { + return "" + } + // Take words from the end until we would exceed overlap (length includes spaces between words). + var tail []string + length := 0 + for i := len(words) - 1; i >= 0; i-- { + w := words[i] + addLen := len(w) + if len(tail) > 0 { + addLen++ // space before this word + } + if length+addLen > overlap { + break + } + tail = append([]string{w}, tail...) + length += addLen + } + return strings.Join(tail, " ") +} + +// SplitParagraphIntoChunksWithOptions splits a paragraph into chunks according to opts. +// Chunks are word-boundary aligned; consecutive chunks may overlap by opts.Overlap characters (word-aligned). +// Words longer than opts.MaxSize are split into smaller chunks when opts.SplitLongWords is true. 
+func SplitParagraphIntoChunksWithOptions(paragraph string, opts Options) []string { + maxSize := opts.MaxSize + if maxSize <= 0 { + maxSize = 1 + } + overlap := opts.Overlap + if overlap >= maxSize { + overlap = maxSize - 1 + } + if overlap < 0 { + overlap = 0 + } + splitLongWords := opts.SplitLongWords + + // Empty or single-chunk within limit (no overlap needed) + if paragraph == "" { + return []string{""} + } + if len(paragraph) <= maxSize && overlap == 0 { + words := strings.Fields(paragraph) + needSplit := false + for _, w := range words { + if len(w) > maxSize && splitLongWords { + needSplit = true + break + } + } + if !needSplit { + return []string{paragraph} + } + } + + words := strings.Fields(paragraph) var chunks []string var currentChunk strings.Builder - - words := strings.Fields(paragraph) // Splits the paragraph into words. + var overlapPrefix string // word-aligned prefix for next chunk (from previous chunk's tail) for _, word := range words { - // If adding the next word would exceed maxChunkSize (considering a space if not the first word in a chunk), - // add the currentChunk to chunks, and reset currentChunk. 
- if currentChunk.Len() > 0 && currentChunk.Len()+len(word)+1 > maxChunkSize { // +1 for the space if not the first word - chunks = append(chunks, currentChunk.String()) - currentChunk.Reset() - } else if currentChunk.Len() == 0 && len(word) > maxChunkSize { // Word itself exceeds maxChunkSize, split the word - chunks = append(chunks, word) + // Long word: split into pieces when SplitLongWords is true + if len(word) > maxSize && splitLongWords { + // Flush current chunk first + if currentChunk.Len() > 0 { + chunks = append(chunks, currentChunk.String()) + if overlap > 0 { + overlapPrefix = overlapTail(currentChunk.String(), overlap) + } else { + overlapPrefix = "" + } + currentChunk.Reset() + } + pieces := splitLongString(word, maxSize) + for _, p := range pieces { + chunks = append(chunks, p) + if overlap > 0 { + overlapPrefix = overlapTail(p, overlap) + } + } continue } - // Add a space before the word if it's not the beginning of a new chunk. + // Normal word: compute length if we add this word + var nextLen int if currentChunk.Len() > 0 { - currentChunk.WriteString(" ") + nextLen = currentChunk.Len() + 1 + len(word) + } else if overlapPrefix != "" { + nextLen = len(overlapPrefix) + 1 + len(word) + } else { + nextLen = len(word) } - // Add the word to the current chunk. 
- currentChunk.WriteString(word) + if nextLen > maxSize { + // Flush current chunk + if currentChunk.Len() > 0 { + chunks = append(chunks, currentChunk.String()) + if overlap > 0 { + overlapPrefix = overlapTail(currentChunk.String(), overlap) + } else { + overlapPrefix = "" + } + currentChunk.Reset() + } + // Start new chunk with overlap prefix only if it fits with the word + if overlapPrefix != "" && len(overlapPrefix)+1+len(word) <= maxSize { + currentChunk.WriteString(overlapPrefix) + currentChunk.WriteString(" ") + currentChunk.WriteString(word) + overlapPrefix = "" + } else { + currentChunk.WriteString(word) + overlapPrefix = "" + } + } else { + if currentChunk.Len() == 0 && overlapPrefix != "" { + currentChunk.WriteString(overlapPrefix) + currentChunk.WriteString(" ") + currentChunk.WriteString(word) + overlapPrefix = "" + } else if currentChunk.Len() > 0 { + currentChunk.WriteString(" ") + currentChunk.WriteString(word) + } else { + currentChunk.WriteString(word) + } + } } - // After the loop, add any remaining content in currentChunk to chunks. if currentChunk.Len() > 0 { chunks = append(chunks, currentChunk.String()) } return chunks } + +// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input, +// and returns a slice of strings where each string is a chunk of the paragraph +// that is at most maxChunkSize long, ensuring that words are not split. +// Words longer than maxChunkSize are split into smaller chunks. +// For overlap and other options, use SplitParagraphIntoChunksWithOptions. 
+func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string { + return SplitParagraphIntoChunksWithOptions(paragraph, Options{ + MaxSize: maxChunkSize, + Overlap: 0, + SplitLongWords: true, + }) +} diff --git a/pkg/client/client.go b/pkg/client/client.go index a87b430..fcfa320 100644 --- a/pkg/client/client.go +++ b/pkg/client/client.go @@ -8,6 +8,7 @@ import ( "io" "mime/multipart" "net/http" + "net/url" "os" "github.com/mudler/localrecall/rag/types" @@ -74,7 +75,7 @@ func (c *Client) ListCollections() ([]string, error) { return collections, nil } -// ListCollections lists all collections +// ListEntries lists all entries in a collection func (c *Client) ListEntries(collection string) ([]string, error) { url := fmt.Sprintf("%s/api/collections/%s/entries", c.BaseURL, collection) @@ -85,16 +86,61 @@ func (c *Client) ListEntries(collection string) ([]string, error) { defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return nil, errors.New("failed to list collections") + return nil, errors.New("failed to list entries") + } + + var result struct { + Data struct { + Entries []string `json:"entries"` + } `json:"data"` + } + err = json.NewDecoder(resp.Body).Decode(&result) + if err != nil { + return nil, err + } + + return result.Data.Entries, nil +} + +// EntryChunk is a single chunk of an entry's content (id, content, metadata only). +type EntryChunk struct { + ID string `json:"id"` + Content string `json:"content"` + Metadata map[string]string `json:"metadata"` +} + +// GetEntryContent returns the chunks (id, content, metadata) for a specific entry in a collection. 
+func (c *Client) GetEntryContent(collection, entry string) ([]EntryChunk, error) { + apiURL := fmt.Sprintf("%s/api/collections/%s/entries/%s", c.BaseURL, collection, url.PathEscape(entry)) + + resp, err := http.Get(apiURL) + if err != nil { + return nil, err } + defer resp.Body.Close() - var entries []string - err = json.NewDecoder(resp.Body).Decode(&entries) + switch resp.StatusCode { + case http.StatusOK: + break + case http.StatusNotFound: + return nil, errors.New("collection or entry not found") + case http.StatusNotImplemented: + return nil, errors.New("this collection backend does not support listing entry content") + default: + return nil, fmt.Errorf("failed to get entry content: status %d", resp.StatusCode) + } + + var result struct { + Data struct { + Chunks []EntryChunk `json:"chunks"` + } `json:"data"` + } + err = json.NewDecoder(resp.Body).Decode(&result) if err != nil { return nil, err } - return entries, nil + return result.Data.Chunks, nil } // DeleteEntry deletes an Entry in a collection and return the entries left diff --git a/rag/collection.go b/rag/collection.go index 94197b1..443f9ad 100644 --- a/rag/collection.go +++ b/rag/collection.go @@ -15,7 +15,7 @@ import ( const collectionPrefix = "collection-" // NewPersistentChromeCollection creates a new persistent knowledge base collection using the ChromemDB engine -func NewPersistentChromeCollection(llmClient *openai.Client, collectionName, dbPath, filePath, embeddingModel string, maxChunkSize int) *PersistentKB { +func NewPersistentChromeCollection(llmClient *openai.Client, collectionName, dbPath, filePath, embeddingModel string, maxChunkSize, chunkOverlap int) *PersistentKB { chromemDB, err := engine.NewChromemDBCollection(collectionName, dbPath, llmClient, embeddingModel) if err != nil { xlog.Error("Failed to create ChromemDB", err) @@ -26,7 +26,7 @@ func NewPersistentChromeCollection(llmClient *openai.Client, collectionName, dbP filepath.Join(dbPath, fmt.Sprintf("%s%s.json", collectionPrefix, 
collectionName)), filePath, chromemDB, - maxChunkSize, llmClient, embeddingModel) + maxChunkSize, chunkOverlap, llmClient, embeddingModel) if err != nil { xlog.Error("Failed to create PersistentKB", err) os.Exit(1) @@ -36,7 +36,7 @@ func NewPersistentChromeCollection(llmClient *openai.Client, collectionName, dbP } // NewPersistentLocalAICollection creates a new persistent knowledge base collection using the LocalAI stores engine -func NewPersistentLocalAICollection(llmClient *openai.Client, apiURL, apiKey, collectionName, dbPath, filePath, embeddingModel string, maxChunkSize int) *PersistentKB { +func NewPersistentLocalAICollection(llmClient *openai.Client, apiURL, apiKey, collectionName, dbPath, filePath, embeddingModel string, maxChunkSize, chunkOverlap int) *PersistentKB { laiStore := localai.NewStoreClient(apiURL, apiKey) ragDB := engine.NewLocalAIRAGDB(laiStore, llmClient, embeddingModel) @@ -44,7 +44,7 @@ func NewPersistentLocalAICollection(llmClient *openai.Client, apiURL, apiKey, co filepath.Join(dbPath, fmt.Sprintf("%s%s.json", collectionPrefix, collectionName)), filePath, ragDB, - maxChunkSize, llmClient, embeddingModel) + maxChunkSize, chunkOverlap, llmClient, embeddingModel) if err != nil { xlog.Error("Failed to create PersistentKB", err) os.Exit(1) @@ -58,7 +58,7 @@ func NewPersistentLocalAICollection(llmClient *openai.Client, apiURL, apiKey, co } // NewPersistentPostgresCollection creates a new persistent knowledge base collection using the PostgreSQL engine -func NewPersistentPostgresCollection(llmClient *openai.Client, collectionName, dbPath, filePath, embeddingModel string, maxChunkSize int, databaseURL string) *PersistentKB { +func NewPersistentPostgresCollection(llmClient *openai.Client, collectionName, dbPath, filePath, embeddingModel string, maxChunkSize, chunkOverlap int, databaseURL string) *PersistentKB { postgresDB, err := engine.NewPostgresDBCollection(collectionName, databaseURL, llmClient, embeddingModel) if err != nil { 
xlog.Error("Failed to create PostgresDB", err) @@ -69,7 +69,7 @@ func NewPersistentPostgresCollection(llmClient *openai.Client, collectionName, d filepath.Join(dbPath, fmt.Sprintf("%s%s.json", collectionPrefix, collectionName)), filePath, postgresDB, - maxChunkSize, llmClient, embeddingModel) + maxChunkSize, chunkOverlap, llmClient, embeddingModel) if err != nil { xlog.Error("Failed to create PersistentKB", err) os.Exit(1) diff --git a/rag/persistency.go b/rag/persistency.go index 29d93ea..a8fb146 100644 --- a/rag/persistency.go +++ b/rag/persistency.go @@ -31,6 +31,7 @@ type PersistentKB struct { path string assetDir string maxChunkSize int + chunkOverlap int sources []*ExternalSource index map[string][]engine.Result @@ -57,7 +58,7 @@ func loadDB(path string) (*CollectionState, error) { return state, nil } -func NewPersistentCollectionKB(stateFile, assetDir string, store Engine, maxChunkSize int, llmClient *openai.Client, embeddingModel string) (*PersistentKB, error) { +func NewPersistentCollectionKB(stateFile, assetDir string, store Engine, maxChunkSize, chunkOverlap int, llmClient *openai.Client, embeddingModel string) (*PersistentKB, error) { // if file exists, try to load an existing state // if file does not exist, create a new state if err := os.MkdirAll(assetDir, 0755); err != nil { @@ -70,6 +71,7 @@ func NewPersistentCollectionKB(stateFile, assetDir string, store Engine, maxChun Engine: store, assetDir: assetDir, maxChunkSize: maxChunkSize, + chunkOverlap: chunkOverlap, sources: []*ExternalSource{}, index: map[string][]engine.Result{}, } @@ -86,6 +88,7 @@ func NewPersistentCollectionKB(stateFile, assetDir string, store Engine, maxChun Engine: store, path: stateFile, maxChunkSize: maxChunkSize, + chunkOverlap: chunkOverlap, assetDir: assetDir, sources: state.ExternalSources, index: state.Index, @@ -208,6 +211,50 @@ func (db *PersistentKB) EntryExists(entry string) bool { return false } +// GetEntryContent returns all chunks (content, id, metadata) for the 
given entry. +// It uses the in-memory index and Engine.GetByID to resolve full chunk data. +func (db *PersistentKB) GetEntryContent(entry string) ([]types.Result, error) { + db.Lock() + defer db.Unlock() + + entry = filepath.Base(entry) + chunkResults, ok := db.index[entry] + if !ok { + return nil, fmt.Errorf("entry not found: %s", entry) + } + + results := make([]types.Result, 0, len(chunkResults)) + for _, r := range chunkResults { + full, err := db.Engine.GetByID(r.ID) + if err != nil { + return nil, fmt.Errorf("failed to get chunk %s: %w", r.ID, err) + } + results = append(results, full) + } + return results, nil +} + +// GetEntryFileContent returns the full content of the stored file (same text that was chunked, without overlap) +// and the number of chunks it occupies. This avoids returning overlapping chunk content. +func (db *PersistentKB) GetEntryFileContent(entry string) (content string, chunkCount int, err error) { + db.Lock() + defer db.Unlock() + + entry = filepath.Base(entry) + chunkResults, ok := db.index[entry] + if !ok { + return "", 0, fmt.Errorf("entry not found: %s", entry) + } + chunkCount = len(chunkResults) + + fpath := filepath.Join(db.assetDir, entry) + content, err = fileToText(fpath) + if err != nil { + return "", 0, err + } + return content, chunkCount, nil +} + // Store stores an entry in the persistent knowledge base. 
func (db *PersistentKB) Store(entry string, metadata map[string]string) error { db.Lock() @@ -307,7 +354,7 @@ func (db *PersistentKB) store(metadata map[string]string, files ...string) ([]en for _, c := range files { e := filepath.Join(db.assetDir, filepath.Base(c)) - pieces, err := chunkFile(e, db.maxChunkSize) + pieces, err := chunkFile(e, db.maxChunkSize, db.chunkOverlap) if err != nil { return nil, err } @@ -407,58 +454,52 @@ func copyFile(src, dst string) error { return nil } -func chunkFile(fpath string, maxchunksize int) ([]string, error) { +// fileToText extracts the full text from a stored file (same logic as chunkFile but no splitting). +// Used by GetEntryFileContent to return content without chunk overlap. +func fileToText(fpath string) (string, error) { if _, err := os.Stat(fpath); os.IsNotExist(err) { - return nil, fmt.Errorf("file does not exist: %s", fpath) + return "", fmt.Errorf("file does not exist: %s", fpath) } - - // Get file extension: - // If it's a .txt file, read the file and split it into chunks. - // If it's a .pdf file, convert it to text and split it into chunks. - // ... 
extension := filepath.Ext(fpath) switch extension { case ".pdf": r, err := pdf.Open(fpath) if err != nil { - return nil, err + return "", err } var buf bytes.Buffer b, err := r.GetPlainText() if err != nil { - return nil, err + return "", err } buf.ReadFrom(b) - return chunk.SplitParagraphIntoChunks(buf.String(), maxchunksize), nil + return buf.String(), nil case ".txt", ".md": - xlog.Debug("Reading text file: ", fpath) f, err := os.Open(fpath) if err != nil { - xlog.Error("Error opening file: ", fpath) - return nil, err + return "", err } defer f.Close() content, err := io.ReadAll(f) if err != nil { - xlog.Error("Error reading file: ", fpath) - return nil, err + return "", err } - contentStr := string(content) - chunks := chunk.SplitParagraphIntoChunks(contentStr, maxchunksize) - xlog.Info("Chunked file", "file", fpath, "content_length", len(contentStr), "max_chunk_size", maxchunksize, "chunk_count", len(chunks)) - if len(chunks) > 0 { - xlog.Debug("First chunk length", "length", len(chunks[0])) - if len(chunks) > 1 { - xlog.Debug("Last chunk length", "length", len(chunks[len(chunks)-1])) - } - } - return chunks, nil - + return string(content), nil default: - xlog.Error("Unsupported file type: ", extension) + return "", fmt.Errorf("unsupported file type: %s", extension) + } +} + +func chunkFile(fpath string, maxchunksize, chunkOverlap int) ([]string, error) { + content, err := fileToText(fpath) + if err != nil { + return nil, err } - return nil, fmt.Errorf("not implemented") + opts := chunk.Options{MaxSize: maxchunksize, Overlap: chunkOverlap, SplitLongWords: true} + chunks := chunk.SplitParagraphIntoChunksWithOptions(content, opts) + xlog.Info("Chunked file", "file", fpath, "content_length", len(content), "max_chunk_size", maxchunksize, "chunk_overlap", chunkOverlap, "chunk_count", len(chunks)) + return chunks, nil } // GetExternalSources returns the list of external sources for this collection diff --git a/rag/persistency_test.go b/rag/persistency_test.go index 
c09407e..512d9a8 100644 --- a/rag/persistency_test.go +++ b/rag/persistency_test.go @@ -78,13 +78,13 @@ var _ = Describe("PersistentKB", func() { Describe("NewPersistentCollectionKB", func() { It("should create a new persistent KB", func() { - kb, err := NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, openaiClient, "granite-embedding-107m-multilingual") + kb, err := NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, 0, openaiClient, "granite-embedding-107m-multilingual") Expect(err).ToNot(HaveOccurred()) Expect(kb).ToNot(BeNil()) }) It("should create state file", func() { - _, err := NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, openaiClient, "granite-embedding-107m-multilingual") + _, err := NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, 0, openaiClient, "granite-embedding-107m-multilingual") Expect(err).ToNot(HaveOccurred()) Expect(stateFile).To(BeAnExistingFile()) }) @@ -95,7 +95,7 @@ var _ = Describe("PersistentKB", func() { BeforeEach(func() { var err error - kb, err = NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, openaiClient, "granite-embedding-107m-multilingual") + kb, err = NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, 0, openaiClient, "granite-embedding-107m-multilingual") Expect(err).ToNot(HaveOccurred()) }) @@ -110,7 +110,7 @@ var _ = Describe("PersistentKB", func() { BeforeEach(func() { var err error - kb, err = NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, openaiClient, "granite-embedding-107m-multilingual") + kb, err = NewPersistentCollectionKB(stateFile, assetDir, engine, 1000, 0, openaiClient, "granite-embedding-107m-multilingual") Expect(err).ToNot(HaveOccurred()) }) @@ -119,4 +119,40 @@ var _ = Describe("PersistentKB", func() { Expect(count).To(Equal(0)) }) }) + + Describe("GetEntryContent", func() { + var kb *PersistentKB + var testFile string + + BeforeEach(func() { + var err error + kb, err = NewPersistentCollectionKB(stateFile, assetDir, engine, 
1000, 0, openaiClient, "granite-embedding-107m-multilingual") + Expect(err).ToNot(HaveOccurred()) + + testFile = filepath.Join(tempDir, "getcontent.txt") + err = os.WriteFile(testFile, []byte("This is content for GetEntryContent test."), 0644) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should return entry not found for missing entry", func() { + _, err := kb.GetEntryContent("nonexistent.txt") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("entry not found")) + }) + + It("should return chunks for stored entry", func() { + err := kb.Store(testFile, map[string]string{"type": "test"}) + Expect(err).ToNot(HaveOccurred()) + + results, err := kb.GetEntryContent("getcontent.txt") + Expect(err).ToNot(HaveOccurred()) + Expect(results).ToNot(BeEmpty()) + + var fullContent string + for _, r := range results { + fullContent += r.Content + } + Expect(fullContent).To(ContainSubstring("This is content for GetEntryContent test")) + }) + }) }) diff --git a/routes.go b/routes.go index a124fa1..c94870a 100644 --- a/routes.go +++ b/routes.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "net/http" + "net/url" "os" "path/filepath" "strings" @@ -66,14 +67,14 @@ func errorResponse(code string, message string, details string) APIResponse { func newVectorEngine( vectorEngineType string, llmClient *openai.Client, - apiURL, apiKey, collectionName, dbPath, embeddingModel string, maxChunkSize int) *rag.PersistentKB { + apiURL, apiKey, collectionName, dbPath, embeddingModel string, maxChunkSize, chunkOverlap int) *rag.PersistentKB { switch vectorEngineType { case "chromem": xlog.Info("Chromem collection", "collectionName", collectionName, "dbPath", dbPath) - return rag.NewPersistentChromeCollection(llmClient, collectionName, dbPath, fileAssets, embeddingModel, maxChunkSize) + return rag.NewPersistentChromeCollection(llmClient, collectionName, dbPath, fileAssets, embeddingModel, maxChunkSize, chunkOverlap) case "localai": xlog.Info("LocalAI collection", "collectionName", 
collectionName, "apiURL", apiURL) - return rag.NewPersistentLocalAICollection(llmClient, apiURL, apiKey, collectionName, dbPath, fileAssets, embeddingModel, maxChunkSize) + return rag.NewPersistentLocalAICollection(llmClient, apiURL, apiKey, collectionName, dbPath, fileAssets, embeddingModel, maxChunkSize, chunkOverlap) case "postgres": databaseURL := os.Getenv("DATABASE_URL") if databaseURL == "" { @@ -81,7 +82,7 @@ func newVectorEngine( os.Exit(1) } xlog.Info("PostgreSQL collection", "collectionName", collectionName, "databaseURL", databaseURL) - return rag.NewPersistentPostgresCollection(llmClient, collectionName, dbPath, fileAssets, embeddingModel, maxChunkSize, databaseURL) + return rag.NewPersistentPostgresCollection(llmClient, collectionName, dbPath, fileAssets, embeddingModel, maxChunkSize, chunkOverlap, databaseURL) default: xlog.Error("Unknown vector engine", "engine", vectorEngineType) os.Exit(1) @@ -91,12 +92,12 @@ func newVectorEngine( } // API routes for managing collections -func registerAPIRoutes(e *echo.Echo, openAIClient *openai.Client, maxChunkingSize int, apiKeys []string) { +func registerAPIRoutes(e *echo.Echo, openAIClient *openai.Client, maxChunkingSize, chunkOverlap int, apiKeys []string) { // Load all collections colls := rag.ListAllCollections(collectionDBPath) for _, c := range colls { - collection := newVectorEngine(vectorEngine, openAIClient, openAIBaseURL, openAIKey, c, collectionDBPath, embeddingModel, maxChunkingSize) + collection := newVectorEngine(vectorEngine, openAIClient, openAIBaseURL, openAIKey, c, collectionDBPath, embeddingModel, maxChunkingSize, chunkOverlap) collections[c] = collection // Register the collection with the source manager sourceManager.RegisterCollection(c, collection) @@ -121,10 +122,11 @@ func registerAPIRoutes(e *echo.Echo, openAIClient *openai.Client, maxChunkingSiz }) } - e.POST("/api/collections", createCollection(collections, openAIClient, embeddingModel, maxChunkingSize)) + e.POST("/api/collections", 
createCollection(collections, openAIClient, embeddingModel, maxChunkingSize, chunkOverlap)) e.POST("/api/collections/:name/upload", uploadFile(collections, fileAssets)) e.GET("/api/collections", listCollections) e.GET("/api/collections/:name/entries", listFiles(collections)) + e.GET("/api/collections/:name/entries/:entry", getEntryContent(collections)) e.POST("/api/collections/:name/search", search(collections)) e.POST("/api/collections/:name/reset", reset(collections)) e.DELETE("/api/collections/:name/entry/delete", deleteEntryFromCollection(collections)) @@ -134,7 +136,7 @@ func registerAPIRoutes(e *echo.Echo, openAIClient *openai.Client, maxChunkingSiz } // createCollection handles creating a new collection -func createCollection(collections collectionList, client *openai.Client, embeddingModel string, maxChunkingSize int) func(c echo.Context) error { +func createCollection(collections collectionList, client *openai.Client, embeddingModel string, maxChunkingSize, chunkOverlap int) func(c echo.Context) error { return func(c echo.Context) error { type request struct { Name string `json:"name"` @@ -145,7 +147,7 @@ func createCollection(collections collectionList, client *openai.Client, embeddi return c.JSON(http.StatusBadRequest, errorResponse(ErrCodeInvalidRequest, "Invalid request", err.Error())) } - collection := newVectorEngine(vectorEngine, client, openAIBaseURL, openAIKey, r.Name, collectionDBPath, embeddingModel, maxChunkingSize) + collection := newVectorEngine(vectorEngine, client, openAIBaseURL, openAIKey, r.Name, collectionDBPath, embeddingModel, maxChunkingSize, chunkOverlap) collections[r.Name] = collection // Register the new collection with the source manager @@ -271,6 +273,42 @@ func listFiles(collections collectionList) func(c echo.Context) error { } } +// getEntryContent returns the full content of the stored file (no chunk overlap) and the number of chunks it occupies. 
+func getEntryContent(collections collectionList) func(c echo.Context) error { + return func(c echo.Context) error { + name := c.Param("name") + collection, exists := collections[name] + if !exists { + return c.JSON(http.StatusNotFound, errorResponse(ErrCodeNotFound, "Collection not found", fmt.Sprintf("Collection '%s' does not exist", name))) + } + + entryParam := c.Param("entry") + entry, err := url.PathUnescape(entryParam) + if err != nil { + entry = entryParam + } + + content, chunkCount, err := collection.GetEntryFileContent(entry) + if err != nil { + if strings.Contains(err.Error(), "entry not found") { + return c.JSON(http.StatusNotFound, errorResponse(ErrCodeNotFound, "Entry not found", fmt.Sprintf("Entry '%s' does not exist in collection '%s'", entry, name))) + } + if strings.Contains(err.Error(), "not implemented") || strings.Contains(err.Error(), "unsupported file type") { + return c.JSON(http.StatusNotImplemented, errorResponse(ErrCodeInternalError, "Not supported", err.Error())) + } + return c.JSON(http.StatusInternalServerError, errorResponse(ErrCodeInternalError, "Failed to get entry content", err.Error())) + } + + response := successResponse("Entry content retrieved successfully", map[string]interface{}{ + "collection": name, + "entry": entry, + "content": content, + "chunk_count": chunkCount, + }) + return c.JSON(http.StatusOK, response) + } +} + // uploadFile handles uploading files to a collection func uploadFile(collections collectionList, fileAssets string) func(c echo.Context) error { return func(c echo.Context) error { diff --git a/static/index.html b/static/index.html index 676e01c..aa05290 100644 --- a/static/index.html +++ b/static/index.html @@ -657,14 +657,24 @@
${escapeHtml(content)}`
+ : 'Empty entry.
'; + + Swal.fire({ + title: `Content: ${escapeHtml(entryName)}`, + html: `