Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ LocalRecall uses environment variables to configure its behavior. These variable
| `LISTENING_ADDRESS` | Address the server listens on (default: `:8080`). Useful for deployments on custom ports or network interfaces. |
| `VECTOR_ENGINE` | Vector database engine to use (`chromem` by default, `postgres` for PostgreSQL). |
| `MAX_CHUNKING_SIZE` | Maximum size (in characters) for breaking down documents into chunks. Affects performance and accuracy. |
| `CHUNK_OVERLAP` | Number of characters shared between consecutive chunks, aligned to whole words (default: `0`, i.e. no overlap). Should be smaller than `MAX_CHUNKING_SIZE`. A non-zero overlap helps preserve context across chunk boundaries. |
| `HYBRID_SEARCH_BM25_WEIGHT` | Weight for BM25 keyword search in hybrid search (default: 0.5, PostgreSQL only). |
| `HYBRID_SEARCH_VECTOR_WEIGHT` | Weight for vector similarity search in hybrid search (default: 0.5, PostgreSQL only). |
| `API_KEYS` | Comma-separated list of API keys for securing access to the REST API (optional). |
Expand Down Expand Up @@ -246,6 +247,14 @@ curl -X GET $BASE_URL/collections
curl -X GET $BASE_URL/collections/myCollection/entries
```

- **Get Entry Content**:

```sh
curl -X GET $BASE_URL/collections/myCollection/entries/file.txt
```

Returns `collection`, `entry`, `chunks` (array of `id`, `content`, `metadata`), and `count`.

- **Search Collection**:

```sh
Expand Down
12 changes: 11 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ var (
listeningAddress = os.Getenv("LISTENING_ADDRESS")
vectorEngine = os.Getenv("VECTOR_ENGINE")
maxChunkingSize = os.Getenv("MAX_CHUNKING_SIZE")
chunkOverlap = os.Getenv("CHUNK_OVERLAP")
apiKeys = os.Getenv("API_KEYS")
gitPrivateKey = os.Getenv("GIT_PRIVATE_KEY")
sourceManager = rag.NewSourceManager(&sources.Config{
Expand Down Expand Up @@ -77,7 +78,16 @@ func startAPI(listenAddress string) {
}
}

registerAPIRoutes(e, openAIClient, chunkingSize, keys)
overlap := 0
if chunkOverlap != "" {
var err error
overlap, err = strconv.Atoi(chunkOverlap)
if err != nil {
e.Logger.Fatal("Failed to convert CHUNK_OVERLAP to integer")
}
}

registerAPIRoutes(e, openAIClient, chunkingSize, overlap, keys)

e.Logger.Fatal(e.Start(listenAddress))
}
Expand Down
80 changes: 80 additions & 0 deletions pkg/chunk/chunk_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package chunk_test

import (
"strings"

. "github.com/mudler/localrecall/pkg/chunk"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -45,5 +47,83 @@ var _ = Describe("Chunk", func() {
chunks := SplitParagraphIntoChunks(text, 30)
Expect(chunks).ToNot(BeEmpty())
})

It("should split words longer than maxChunkSize into chunks each <= maxChunkSize", func() {
text := "normal verylongwordhere end"
chunks := SplitParagraphIntoChunks(text, 5)
Expect(chunks).ToNot(BeEmpty())
for _, c := range chunks {
Expect(len(c)).To(BeNumerically("<=", 5))
}
// "verylongwordhere" (16 chars) with max 5 -> 4 chunks of 5,5,5,1
Expect(chunks).To(ContainElement("veryl"))
Expect(chunks).To(ContainElement("ongwo"))
Expect(chunks).To(ContainElement("rdher"))
Expect(chunks).To(ContainElement("e"))
})

It("backward compatibility: SplitParagraphIntoChunks matches Options with Overlap 0", func() {
text := "This is a very long text that should be split into multiple chunks."
chunksLegacy := SplitParagraphIntoChunks(text, 20)
chunksOpts := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 0, SplitLongWords: true})
Expect(chunksLegacy).To(Equal(chunksOpts))
})
})

Describe("SplitParagraphIntoChunksWithOptions", func() {
It("should apply overlap between consecutive chunks", func() {
text := "one two three four five six seven eight nine ten"
chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 10})
Expect(chunks).ToNot(BeEmpty())
for _, c := range chunks {
Expect(len(c)).To(BeNumerically("<=", 20))
}
// Consecutive chunks should share a suffix/prefix
for i := 0; i < len(chunks)-1; i++ {
tail := chunks[i]
head := chunks[i+1]
// Some overlap: tail of chunk i should appear at start of chunk i+1
found := false
for n := 1; n <= len(tail) && n <= len(head); n++ {
if tail[len(tail)-n:] == head[:n] {
found = true
break
}
}
// Or head starts with last words of tail (word-aligned)
wordsTail := strings.Fields(tail)
if len(wordsTail) > 0 {
lastWord := wordsTail[len(wordsTail)-1]
if strings.HasPrefix(head, lastWord) || head == lastWord {
found = true
}
}
Expect(found).To(BeTrue(), "chunk %d and %d should share overlap", i, i+1)
}
})

It("Overlap 0 matches no overlap", func() {
text := "a b c d e f g h i j k l m n o p"
chunksOverlap := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 0})
chunksLegacy := SplitParagraphIntoChunks(text, 10)
Expect(chunksOverlap).To(Equal(chunksLegacy))
})

It("SplitLongWords false allows a single word to exceed MaxSize as one chunk", func() {
text := "short verylongwordhere end"
chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 5, Overlap: 0, SplitLongWords: false})
Expect(chunks).To(ContainElement("verylongwordhere"))
Expect(chunks).To(ContainElement("short"))
Expect(chunks).To(ContainElement("end"))
})

It("clamps Overlap >= MaxSize to MaxSize-1", func() {
text := "one two three four five"
chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 99})
Expect(chunks).ToNot(BeEmpty())
for _, c := range chunks {
Expect(len(c)).To(BeNumerically("<=", 10))
}
})
})
})
185 changes: 165 additions & 20 deletions pkg/chunk/chunking.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,43 +4,188 @@ import (
"strings"
)

// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
// and returns a slice of strings where each string is a chunk of the paragraph
// that is at most maxChunkSize long, ensuring that words are not split.
func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string {
if len(paragraph) <= maxChunkSize {
return []string{paragraph}
// Options configures how SplitParagraphIntoChunksWithOptions splits a paragraph.
// All sizes are compared against len() of the text, so they are byte counts;
// they equal character counts only for single-byte (ASCII) text.
type Options struct {
// MaxSize is the maximum size of each chunk. It should be > 0;
// SplitParagraphIntoChunksWithOptions treats values <= 0 as 1.
MaxSize int
// Overlap is the maximum size of the word-aligned tail of a chunk that is repeated
// at the start of the following chunk (0 = no overlap).
// It should be < MaxSize: values >= MaxSize are clamped to MaxSize-1 and
// negative values are treated as 0.
Overlap int
// SplitLongWords, when true, cuts words longer than MaxSize into pieces so that no
// chunk exceeds MaxSize. When false, such a word is kept whole and becomes a chunk
// that is larger than MaxSize.
// NOTE: the zero value is false, so callers building Options by hand must set it
// explicitly; the SplitParagraphIntoChunks wrapper sets it to true.
SplitLongWords bool
}

// splitLongString cuts s into consecutive pieces of at most maxSize bytes whose
// concatenation equals s.
//
// Cuts are only made on UTF-8 rune boundaries, so a multi-byte character is never
// torn in half (which would produce invalid UTF-8 in two neighbouring pieces).
// The single exception to the size limit is a rune that is itself wider than
// maxSize: it is emitted whole as its own piece so the loop always makes progress.
//
// If maxSize <= 0 or s already fits, s is returned unchanged as the only piece.
func splitLongString(s string, maxSize int) []string {
	if maxSize <= 0 || len(s) <= maxSize {
		return []string{s}
	}

	pieces := make([]string, 0, (len(s)+maxSize-1)/maxSize)
	for len(s) > maxSize {
		cut := maxSize
		// s[cut] would be the first byte of the next piece. If it is a UTF-8
		// continuation byte (10xxxxxx) the cut falls inside a rune: back off.
		for cut > 0 && s[cut]&0xC0 == 0x80 {
			cut--
		}
		if cut == 0 {
			// The leading rune is wider than maxSize; take it whole.
			cut = 1
			for cut < len(s) && s[cut]&0xC0 == 0x80 {
				cut++
			}
		}
		pieces = append(pieces, s[:cut])
		s = s[cut:]
	}
	if len(s) > 0 {
		pieces = append(pieces, s)
	}
	return pieces
}

// overlapTail returns the longest run of trailing whole words of chunk whose
// single-space-joined length does not exceed overlap.
//
// Words are whitespace-delimited (strings.Fields), so any original spacing is
// normalised to single spaces in the result. The result is "" when overlap <= 0,
// when chunk contains no words, or when even the last word is longer than overlap.
func overlapTail(chunk string, overlap int) string {
	if overlap <= 0 || chunk == "" {
		return ""
	}

	fields := strings.Fields(chunk)
	// first is the index of the earliest word included in the tail;
	// len(fields) means "nothing selected yet".
	first := len(fields)
	used := 0
	for i := len(fields) - 1; i >= 0; i-- {
		cost := len(fields[i])
		if first < len(fields) {
			cost++ // separating space between this word and the tail collected so far
		}
		if used+cost > overlap {
			break
		}
		used += cost
		first = i
	}
	return strings.Join(fields[first:], " ")
}

// SplitParagraphIntoChunksWithOptions splits a paragraph into chunks according to opts.
// Chunks are word-boundary aligned; consecutive chunks may overlap by opts.Overlap characters (word-aligned).
// Words longer than opts.MaxSize are split into smaller chunks when opts.SplitLongWords is true.
func SplitParagraphIntoChunksWithOptions(paragraph string, opts Options) []string {
maxSize := opts.MaxSize
if maxSize <= 0 {
maxSize = 1
}
overlap := opts.Overlap
if overlap >= maxSize {
overlap = maxSize - 1
}
if overlap < 0 {
overlap = 0
}
splitLongWords := opts.SplitLongWords

// Empty or single-chunk within limit (no overlap needed)
if paragraph == "" {
return []string{""}
}
if len(paragraph) <= maxSize && overlap == 0 {
words := strings.Fields(paragraph)
needSplit := false
for _, w := range words {
if len(w) > maxSize && splitLongWords {
needSplit = true
break
}
}
if !needSplit {
return []string{paragraph}
}
}

words := strings.Fields(paragraph)
var chunks []string
var currentChunk strings.Builder

words := strings.Fields(paragraph) // Splits the paragraph into words.
var overlapPrefix string // word-aligned prefix for next chunk (from previous chunk's tail)

for _, word := range words {
// If adding the next word would exceed maxChunkSize (considering a space if not the first word in a chunk),
// add the currentChunk to chunks, and reset currentChunk.
if currentChunk.Len() > 0 && currentChunk.Len()+len(word)+1 > maxChunkSize { // +1 for the space if not the first word
chunks = append(chunks, currentChunk.String())
currentChunk.Reset()
} else if currentChunk.Len() == 0 && len(word) > maxChunkSize { // Word itself exceeds maxChunkSize, split the word
chunks = append(chunks, word)
// Long word: split into pieces when SplitLongWords is true
if len(word) > maxSize && splitLongWords {
// Flush current chunk first
if currentChunk.Len() > 0 {
chunks = append(chunks, currentChunk.String())
if overlap > 0 {
overlapPrefix = overlapTail(currentChunk.String(), overlap)
} else {
overlapPrefix = ""
}
currentChunk.Reset()
}
pieces := splitLongString(word, maxSize)
for _, p := range pieces {
chunks = append(chunks, p)
if overlap > 0 {
overlapPrefix = overlapTail(p, overlap)
}
}
continue
}

// Add a space before the word if it's not the beginning of a new chunk.
// Normal word: compute length if we add this word
var nextLen int
if currentChunk.Len() > 0 {
currentChunk.WriteString(" ")
nextLen = currentChunk.Len() + 1 + len(word)
} else if overlapPrefix != "" {
nextLen = len(overlapPrefix) + 1 + len(word)
} else {
nextLen = len(word)
}

// Add the word to the current chunk.
currentChunk.WriteString(word)
if nextLen > maxSize {
// Flush current chunk
if currentChunk.Len() > 0 {
chunks = append(chunks, currentChunk.String())
if overlap > 0 {
overlapPrefix = overlapTail(currentChunk.String(), overlap)
} else {
overlapPrefix = ""
}
currentChunk.Reset()
}
// Start new chunk with overlap prefix only if it fits with the word
if overlapPrefix != "" && len(overlapPrefix)+1+len(word) <= maxSize {
currentChunk.WriteString(overlapPrefix)
currentChunk.WriteString(" ")
currentChunk.WriteString(word)
overlapPrefix = ""
} else {
currentChunk.WriteString(word)
overlapPrefix = ""
}
} else {
if currentChunk.Len() == 0 && overlapPrefix != "" {
currentChunk.WriteString(overlapPrefix)
currentChunk.WriteString(" ")
currentChunk.WriteString(word)
overlapPrefix = ""
} else if currentChunk.Len() > 0 {
currentChunk.WriteString(" ")
currentChunk.WriteString(word)
} else {
currentChunk.WriteString(word)
}
}
}

// After the loop, add any remaining content in currentChunk to chunks.
if currentChunk.Len() > 0 {
chunks = append(chunks, currentChunk.String())
}

return chunks
}

// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
// and returns a slice of strings where each string is a chunk of the paragraph
// that is at most maxChunkSize long, ensuring that words are not split.
// Words longer than maxChunkSize are split into smaller chunks.
// For overlap and other options, use SplitParagraphIntoChunksWithOptions.
func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string {
return SplitParagraphIntoChunksWithOptions(paragraph, Options{
MaxSize: maxChunkSize,
Overlap: 0,
SplitLongWords: true,
})
}
Loading
Loading