Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 13:03:04 +03:00
parent d9de37d3d8
commit e53e7662e9
13 changed files with 628 additions and 0 deletions
@@ -0,0 +1,31 @@
+package parse
+
+import "strings"
+
// Chunk splits text into paragraphs on blank lines ("\n\n") and greedily packs
// consecutive paragraphs into chunks of at most maxRunes runes.
//
// Each paragraph is trimmed of surrounding whitespace and empty paragraphs are
// dropped. Paragraphs that share a chunk are joined with "\n\n", and that
// separator counts toward the limit. A single paragraph longer than maxRunes
// is never split; it is emitted as its own oversized chunk. Sizes are measured
// in runes (Unicode code points), not bytes.
//
// Chunk returns nil when text contains no non-blank paragraph.
func Chunk(text string, maxRunes int) []string {
	const (
		sep      = "\n\n"
		sepRunes = 2 // rune length of sep
	)

	var chunks []string
	var cur strings.Builder
	// curRunes mirrors the rune length of cur. Tracking it incrementally avoids
	// re-scanning the accumulated chunk for every paragraph, which would make
	// packing quadratic in the chunk size.
	curRunes := 0

	for _, p := range strings.Split(text, sep) {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}
		pRunes := len([]rune(p))

		// Flush the pending chunk if appending this paragraph (plus the
		// separator) would exceed the budget. An empty pending chunk is never
		// flushed, so an oversized paragraph still lands in a chunk by itself.
		if curRunes > 0 && curRunes+sepRunes+pRunes > maxRunes {
			chunks = append(chunks, cur.String())
			cur.Reset()
			curRunes = 0
		}
		if curRunes > 0 {
			cur.WriteString(sep)
			curRunes += sepRunes
		}
		cur.WriteString(p)
		curRunes += pRunes
	}
	if curRunes > 0 {
		chunks = append(chunks, cur.String())
	}
	return chunks
}