mudler · mudler · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/README.md b/README.md
@@ -202,6 +202,7 @@ LocalRecall uses environment variables to configure its behavior. These variable
 | `LISTENING_ADDRESS`         | Address the server listens on (default: `:8080`). Useful for deployments on custom ports or network interfaces. |
 | `VECTOR_ENGINE`             | Vector database engine to use (`chromem` by default, `postgres` for PostgreSQL).                              |
 | `MAX_CHUNKING_SIZE`         | Maximum size (in characters) for breaking down documents into chunks. Affects performance and accuracy.       |
+| `CHUNK_OVERLAP`             | Overlap in characters between consecutive chunks (word-aligned). Default: 0. Use to improve context across chunk boundaries. |
 | `HYBRID_SEARCH_BM25_WEIGHT` | Weight for BM25 keyword search in hybrid search (default: 0.5, PostgreSQL only).                                 |
 | `HYBRID_SEARCH_VECTOR_WEIGHT` | Weight for vector similarity search in hybrid search (default: 0.5, PostgreSQL only).                           |
 | `API_KEYS`                  | Comma-separated list of API keys for securing access to the REST API (optional).                                |
@@ -246,6 +247,14 @@ curl -X GET $BASE_URL/collections
 curl -X GET $BASE_URL/collections/myCollection/entries
 ```
 
+- **Get Entry Content**:
+
+```sh
+curl -X GET $BASE_URL/collections/myCollection/entries/file.txt
+```
+
+Returns `collection`, `entry`, `chunks` (array of `id`, `content`, `metadata`), and `count`.
+
 - **Search Collection**:
 
 ```sh

diff --git a/main.go b/main.go
@@ -21,6 +21,7 @@ var (
 	listeningAddress = os.Getenv("LISTENING_ADDRESS")
 	vectorEngine     = os.Getenv("VECTOR_ENGINE")
 	maxChunkingSize  = os.Getenv("MAX_CHUNKING_SIZE")
+	chunkOverlap     = os.Getenv("CHUNK_OVERLAP")
 	apiKeys          = os.Getenv("API_KEYS")
 	gitPrivateKey    = os.Getenv("GIT_PRIVATE_KEY")
 	sourceManager    = rag.NewSourceManager(&sources.Config{
@@ -77,7 +78,16 @@ func startAPI(listenAddress string) {
 		}
 	}
 
-	registerAPIRoutes(e, openAIClient, chunkingSize, keys)
+	overlap := 0
+	if chunkOverlap != "" {
+		var err error
+		overlap, err = strconv.Atoi(chunkOverlap)
+		if err != nil {
+			e.Logger.Fatal("Failed to convert CHUNK_OVERLAP to integer")
+		}
+	}
+
+	registerAPIRoutes(e, openAIClient, chunkingSize, overlap, keys)
 
 	e.Logger.Fatal(e.Start(listenAddress))
 }

diff --git a/pkg/chunk/chunk_test.go b/pkg/chunk/chunk_test.go
@@ -1,6 +1,8 @@
 package chunk_test
 
 import (
+	"strings"
+
 	. "github.com/mudler/localrecall/pkg/chunk"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -45,5 +47,83 @@ var _ = Describe("Chunk", func() {
 			chunks := SplitParagraphIntoChunks(text, 30)
 			Expect(chunks).ToNot(BeEmpty())
 		})
+
+		It("should split words longer than maxChunkSize into chunks each <= maxChunkSize", func() {
+			text := "normal verylongwordhere end"
+			chunks := SplitParagraphIntoChunks(text, 5)
+			Expect(chunks).ToNot(BeEmpty())
+			for _, c := range chunks {
+				Expect(len(c)).To(BeNumerically("<=", 5))
+			}
+			// "verylongwordhere" (16 chars) with max 5 -> 4 chunks of 5,5,5,1
+			Expect(chunks).To(ContainElement("veryl"))
+			Expect(chunks).To(ContainElement("ongwo"))
+			Expect(chunks).To(ContainElement("rdher"))
+			Expect(chunks).To(ContainElement("e"))
+		})
+
+		It("backward compatibility: SplitParagraphIntoChunks matches Options with Overlap 0", func() {
+			text := "This is a very long text that should be split into multiple chunks."
+			chunksLegacy := SplitParagraphIntoChunks(text, 20)
+			chunksOpts := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 0, SplitLongWords: true})
+			Expect(chunksLegacy).To(Equal(chunksOpts))
+		})
+	})
+
+	Describe("SplitParagraphIntoChunksWithOptions", func() {
+		It("should apply overlap between consecutive chunks", func() {
+			text := "one two three four five six seven eight nine ten"
+			chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 10})
+			Expect(chunks).ToNot(BeEmpty())
+			for _, c := range chunks {
+				Expect(len(c)).To(BeNumerically("<=", 20))
+			}
+			// Consecutive chunks should share a suffix/prefix
+			for i := 0; i < len(chunks)-1; i++ {
+				tail := chunks[i]
+				head := chunks[i+1]
+				// Some overlap: tail of chunk i should appear at start of chunk i+1
+				found := false
+				for n := 1; n <= len(tail) && n <= len(head); n++ {
+					if tail[len(tail)-n:] == head[:n] {
+						found = true
+						break
+					}
+				}
+				// Or head starts with last words of tail (word-aligned)
+				wordsTail := strings.Fields(tail)
+				if len(wordsTail) > 0 {
+					lastWord := wordsTail[len(wordsTail)-1]
+					if strings.HasPrefix(head, lastWord) || head == lastWord {
+						found = true
+					}
+				}
+				Expect(found).To(BeTrue(), "chunk %d and %d should share overlap", i, i+1)
+			}
+		})
+
+		It("Overlap 0 matches no overlap", func() {
+			text := "a b c d e f g h i j k l m n o p"
+			chunksOverlap := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 0})
+			chunksLegacy := SplitParagraphIntoChunks(text, 10)
+			Expect(chunksOverlap).To(Equal(chunksLegacy))
+		})
+
+		It("SplitLongWords false allows a single word to exceed MaxSize as one chunk", func() {
+			text := "short verylongwordhere end"
+			chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 5, Overlap: 0, SplitLongWords: false})
+			Expect(chunks).To(ContainElement("verylongwordhere"))
+			Expect(chunks).To(ContainElement("short"))
+			Expect(chunks).To(ContainElement("end"))
+		})
+
+		It("clamps Overlap >= MaxSize to MaxSize-1", func() {
+			text := "one two three four five"
+			chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 99})
+			Expect(chunks).ToNot(BeEmpty())
+			for _, c := range chunks {
+				Expect(len(c)).To(BeNumerically("<=", 10))
+			}
+		})
 	})
 })
diff --git a/pkg/chunk/chunking.go b/pkg/chunk/chunking.go
@@ -4,43 +4,188 @@ import (
 	"strings"
 )
 
-// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
-// and returns a slice of strings where each string is a chunk of the paragraph
-// that is at most maxChunkSize long, ensuring that words are not split.
-func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string {
-	if len(paragraph) <= maxChunkSize {
-		return []string{paragraph}
+// Options configures paragraph chunking.
+type Options struct {
+	// MaxSize is the maximum chunk length, measured in bytes with len (should be > 0; values <= 0 are treated as 1).
+	MaxSize int
+	// Overlap is the overlap in characters between consecutive chunks, word-aligned (0 = no overlap).
+	// Must be < MaxSize; values >= MaxSize are clamped to MaxSize-1, and negative values are treated as 0.
+	Overlap int
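+	// SplitLongWords, when true, splits words longer than MaxSize into pieces of at most MaxSize so no chunk exceeds MaxSize. The zero value is false (a long word is kept whole as its own chunk); SplitParagraphIntoChunks sets it to true.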
+	SplitLongWords bool
+}
+
+// splitLongString splits s into consecutive pieces of at most maxSize bytes, slicing by byte
+// index (a multi-byte UTF-8 character may be cut). If maxSize <= 0, s is returned whole.
+func splitLongString(s string, maxSize int) []string {
+	if maxSize <= 0 || len(s) <= maxSize {
+		return []string{s}
+	}
+	var pieces []string
+	for len(s) > 0 {
+		n := maxSize
+		if n > len(s) {
+			n = len(s)
+		}
+		pieces = append(pieces, s[:n])
+		s = s[n:]
 	}
+	return pieces
+}
 
+// overlapTail returns the suffix of chunk that is at most overlap characters and word-aligned (whole words only).
+// If overlap is 0 or chunk is empty, returns "".
+func overlapTail(chunk string, overlap int) string {
+	if overlap <= 0 || chunk == "" {
+		return ""
+	}
+	words := strings.Fields(chunk)
+	if len(words) == 0 {
+		return ""
+	}
+	// Take words from the end until we would exceed overlap (length includes spaces between words).
+	var tail []string
+	length := 0
+	for i := len(words) - 1; i >= 0; i-- {
+		w := words[i]
+		addLen := len(w)
+		if len(tail) > 0 {
+			addLen++ // space before this word
+		}
+		if length+addLen > overlap {
+			break
+		}
+		tail = append([]string{w}, tail...)
+		length += addLen
+	}
+	return strings.Join(tail, " ")
+}
+
+// SplitParagraphIntoChunksWithOptions splits a paragraph into chunks according to opts.
+// Chunks are word-boundary aligned; consecutive chunks may overlap by up to opts.Overlap bytes of whole words (the overlap is skipped when it would not fit together with the next word).
+// Words longer than opts.MaxSize are split into smaller chunks when opts.SplitLongWords is true.
+func SplitParagraphIntoChunksWithOptions(paragraph string, opts Options) []string {
+	maxSize := opts.MaxSize
+	if maxSize <= 0 {
+		maxSize = 1
+	}
+	overlap := opts.Overlap
+	if overlap >= maxSize {
+		overlap = maxSize - 1
+	}
+	if overlap < 0 {
+		overlap = 0
+	}
+	splitLongWords := opts.SplitLongWords
+
+	// Empty or single-chunk within limit (no overlap needed)
+	if paragraph == "" {
+		return []string{""}
+	}
+	if len(paragraph) <= maxSize && overlap == 0 {
+		words := strings.Fields(paragraph)
+		needSplit := false
+		for _, w := range words {
+			if len(w) > maxSize && splitLongWords {
+				needSplit = true
+				break
+			}
+		}
+		if !needSplit {
+			return []string{paragraph}
+		}
+	}
+
+	words := strings.Fields(paragraph)
 	var chunks []string
 	var currentChunk strings.Builder
-
-	words := strings.Fields(paragraph) // Splits the paragraph into words.
+	var overlapPrefix string // word-aligned prefix for next chunk (from previous chunk's tail)
 
 	for _, word := range words {
-		// If adding the next word would exceed maxChunkSize (considering a space if not the first word in a chunk),
-		// add the currentChunk to chunks, and reset currentChunk.
-		if currentChunk.Len() > 0 && currentChunk.Len()+len(word)+1 > maxChunkSize { // +1 for the space if not the first word
-			chunks = append(chunks, currentChunk.String())
-			currentChunk.Reset()
-		} else if currentChunk.Len() == 0 && len(word) > maxChunkSize { // Word itself exceeds maxChunkSize, split the word
-			chunks = append(chunks, word)
+		// Long word: split into pieces when SplitLongWords is true
+		if len(word) > maxSize && splitLongWords {
+			// Flush current chunk first
+			if currentChunk.Len() > 0 {
+				chunks = append(chunks, currentChunk.String())
+				if overlap > 0 {
+					overlapPrefix = overlapTail(currentChunk.String(), overlap)
+				} else {
+					overlapPrefix = ""
+				}
+				currentChunk.Reset()
+			}
+			pieces := splitLongString(word, maxSize)
+			for _, p := range pieces {
+				chunks = append(chunks, p)
+				if overlap > 0 {
+					overlapPrefix = overlapTail(p, overlap)
+				}
+			}
 			continue
 		}
 
-		// Add a space before the word if it's not the beginning of a new chunk.
+		// Normal word: compute length if we add this word
+		var nextLen int
 		if currentChunk.Len() > 0 {
-			currentChunk.WriteString(" ")
+			nextLen = currentChunk.Len() + 1 + len(word)
+		} else if overlapPrefix != "" {
+			nextLen = len(overlapPrefix) + 1 + len(word)
+		} else {
+			nextLen = len(word)
 		}
 
-		// Add the word to the current chunk.
-		currentChunk.WriteString(word)
+		if nextLen > maxSize {
+			// Flush current chunk
+			if currentChunk.Len() > 0 {
+				chunks = append(chunks, currentChunk.String())
+				if overlap > 0 {
+					overlapPrefix = overlapTail(currentChunk.String(), overlap)
+				} else {
+					overlapPrefix = ""
+				}
+				currentChunk.Reset()
+			}
+			// Start new chunk with overlap prefix only if it fits with the word
+			if overlapPrefix != "" && len(overlapPrefix)+1+len(word) <= maxSize {
+				currentChunk.WriteString(overlapPrefix)
+				currentChunk.WriteString(" ")
+				currentChunk.WriteString(word)
+				overlapPrefix = ""
+			} else {
+				currentChunk.WriteString(word)
+				overlapPrefix = ""
+			}
+		} else {
+			if currentChunk.Len() == 0 && overlapPrefix != "" {
+				currentChunk.WriteString(overlapPrefix)
+				currentChunk.WriteString(" ")
+				currentChunk.WriteString(word)
+				overlapPrefix = ""
+			} else if currentChunk.Len() > 0 {
+				currentChunk.WriteString(" ")
+				currentChunk.WriteString(word)
+			} else {
+				currentChunk.WriteString(word)
+			}
+		}
 	}
 
-	// After the loop, add any remaining content in currentChunk to chunks.
 	if currentChunk.Len() > 0 {
 		chunks = append(chunks, currentChunk.String())
 	}
 
 	return chunks
 }
+
+// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
+// and returns a slice of strings where each string is a chunk of the paragraph
+// that is at most maxChunkSize long. Chunks break at word boundaries, except that
+// words longer than maxChunkSize are themselves split into pieces of at most maxChunkSize.
+// For overlap and other options, use SplitParagraphIntoChunksWithOptions.
+func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string {
+	return SplitParagraphIntoChunksWithOptions(paragraph, Options{
+		MaxSize:        maxChunkSize,
+		Overlap:        0,
+		SplitLongWords: true,
+	})
+}