Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
package parse
|
||||
|
||||
import "strings"
|
||||
|
||||
// Chunk splits text into paragraphs on blank lines ("\n\n") and packs
// consecutive paragraphs into chunks of at most maxRunes runes.
//
// Each paragraph is trimmed of surrounding whitespace and empty paragraphs
// are dropped. Paragraphs that share a chunk are joined with "\n\n", and that
// separator counts toward the limit. A single paragraph longer than maxRunes
// is never split; it is kept as its own chunk. The result is nil when text
// contains no non-blank paragraph.
func Chunk(text string, maxRunes int) []string {
	const sep = "\n\n" // paragraph separator; two ASCII bytes, so two runes

	var (
		chunks   []string
		cur      strings.Builder
		curRunes int // rune length of cur, maintained incrementally
	)

	for _, p := range strings.Split(text, sep) {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}
		pLen := len([]rune(p))

		// Flush the current chunk when adding this paragraph (plus the
		// separator) would push it past the limit. An empty chunk is never
		// flushed, which is what lets an oversized paragraph stand alone.
		if cur.Len() > 0 && curRunes+len(sep)+pLen > maxRunes {
			chunks = append(chunks, cur.String())
			cur.Reset()
			curRunes = 0
		}

		if cur.Len() > 0 {
			cur.WriteString(sep)
			curRunes += len(sep)
		}
		cur.WriteString(p)
		// The ASCII separator keeps rune counts additive across paragraphs,
		// so the running total always equals the rune length of cur.
		curRunes += pLen
	}

	if cur.Len() > 0 {
		chunks = append(chunks, cur.String())
	}
	return chunks
}
|
||||
Reference in New Issue
Block a user