Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
package parse_test
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"qbank/internal/parse"
|
||||
)
|
||||
|
||||
func TestChunk(t *testing.T) {
|
||||
t.Run("small text stays in one chunk", func(t *testing.T) {
|
||||
text := "Para one.\n\nPara two.\n\nPara three."
|
||||
chunks := parse.Chunk(text, 1000)
|
||||
if len(chunks) != 1 {
|
||||
t.Errorf("want 1 chunk, got %d: %v", len(chunks), chunks)
|
||||
}
|
||||
if !strings.Contains(chunks[0], "Para one") || !strings.Contains(chunks[0], "Para three") {
|
||||
t.Errorf("content lost: %q", chunks[0])
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("paragraphs split when over limit", func(t *testing.T) {
|
||||
para := strings.Repeat("x", 600)
|
||||
text := para + "\n\n" + para + "\n\n" + para
|
||||
chunks := parse.Chunk(text, 1000)
|
||||
if len(chunks) < 2 {
|
||||
t.Errorf("want ≥2 chunks for 1800-rune input with 1000 limit, got %d", len(chunks))
|
||||
}
|
||||
// No chunk should combine paragraphs past the limit
|
||||
for i, c := range chunks {
|
||||
if len([]rune(c)) > 1200 {
|
||||
t.Errorf("chunk %d is %d runes, too large", i, len([]rune(c)))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("single oversized paragraph kept as own chunk", func(t *testing.T) {
|
||||
bigPara := strings.Repeat("x", 2000)
|
||||
chunks := parse.Chunk(bigPara, 1000)
|
||||
if len(chunks) != 1 {
|
||||
t.Errorf("want 1 chunk for single oversized para, got %d", len(chunks))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("empty paragraphs ignored", func(t *testing.T) {
|
||||
text := "\n\nPara one.\n\n\n\nPara two.\n\n"
|
||||
chunks := parse.Chunk(text, 1000)
|
||||
if len(chunks) != 1 {
|
||||
t.Errorf("want 1 chunk after ignoring blanks, got %d", len(chunks))
|
||||
}
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user