Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Jānis Kacēns
2026-05-11 13:03:04 +03:00
parent d9de37d3d8
commit e53e7662e9
13 changed files with 628 additions and 0 deletions
+31
View File
@@ -0,0 +1,31 @@
package parse
import "strings"
// Chunk splits text into paragraphs on blank lines ("\n\n") and greedily packs
// consecutive paragraphs into chunks of at most maxRunes runes.
//
// Each paragraph is trimmed of surrounding whitespace, and paragraphs that are
// empty after trimming are dropped. Paragraphs packed into the same chunk are
// joined with "\n\n"; the separator counts toward the chunk's rune length.
//
// A paragraph is never split: a single paragraph longer than maxRunes is
// emitted as its own chunk. Consequently, when maxRunes <= 0 every paragraph
// becomes its own chunk.
//
// Chunk returns nil when text contains no non-empty paragraphs.
func Chunk(text string, maxRunes int) []string {
	var (
		chunks   []string
		cur      strings.Builder
		curRunes int // rune length of cur, tracked so the builder is never rescanned
	)

	for _, p := range strings.Split(text, "\n\n") {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}
		pLen := len([]rune(p))

		// Flush the pending chunk if appending this paragraph (plus the
		// two-rune separator) would push it past the limit.
		if cur.Len() > 0 && curRunes+2+pLen > maxRunes {
			chunks = append(chunks, cur.String())
			cur.Reset()
			curRunes = 0
		}

		if cur.Len() > 0 {
			cur.WriteString("\n\n")
			curRunes += 2
		}
		cur.WriteString(p)
		curRunes += pLen
	}

	// Emit whatever is left in the builder as the final chunk.
	if cur.Len() > 0 {
		chunks = append(chunks, cur.String())
	}
	return chunks
}