Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Jānis Kacēns
2026-05-11 13:03:04 +03:00
parent d9de37d3d8
commit e53e7662e9
13 changed files with 628 additions and 0 deletions
+53
View File
@@ -0,0 +1,53 @@
package parse_test
import (
"bytes"
"strings"
"testing"
"qbank/internal/parse"
)
// TestAcceptanceDOCXPipeline runs the DOCX → text → chunk path end to end.
// It wraps a fixed document.xml in an in-memory .docx, extracts the text,
// checks that known phrases survived, and confirms chunking yields output.
func TestAcceptanceDOCXPipeline(t *testing.T) {
	const docXML = `<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>1. Which keyword declares a variable in Go?</w:t></w:r></w:p>
<w:p><w:r><w:t>A) var</w:t></w:r></w:p>
<w:p><w:r><w:t>B) let</w:t></w:r></w:p>
<w:p><w:r><w:t>C) dim</w:t></w:r></w:p>
<w:p><w:r><w:t>Correct: A</w:t></w:r></w:p>
<w:p><w:r><w:t>2. What does fmt.Println return?</w:t></w:r></w:p>
<w:p><w:r><w:t>A) Nothing</w:t></w:r></w:p>
<w:p><w:r><w:t>B) n int, err error</w:t></w:r></w:p>
<w:p><w:r><w:t>Correct: B</w:t></w:r></w:p>
</w:body>
</w:document>`

	archive := buildDocx(t, docXML)
	extracted, err := parse.ExtractDOCX(bytes.NewReader(archive))
	if err != nil {
		t.Fatalf("ExtractDOCX: %v", err)
	}

	// One phrase from each of three different paragraphs of the fixture.
	for _, phrase := range []string{
		"Which keyword declares a variable",
		"fmt.Println",
		"n int, err error",
	} {
		if strings.Contains(extracted, phrase) {
			continue
		}
		t.Errorf("text missing %q\nfull text:\n%s", phrase, extracted)
	}

	// Non-empty text must come back as at least one chunk.
	pieces := parse.Chunk(extracted, 10_000)
	if len(pieces) == 0 {
		t.Error("Chunk returned 0 chunks for non-empty text")
	}

	t.Logf("extracted %d chars, %d chunk(s)", len(extracted), len(pieces))
}
+31
View File
@@ -0,0 +1,31 @@
package parse
import "strings"
// Chunk splits text into paragraphs on blank lines ("\n\n") and packs
// consecutive paragraphs into chunks of at most maxRunes runes.
//
// Each paragraph is trimmed of surrounding whitespace and empty paragraphs
// are dropped. Paragraphs that share a chunk are rejoined with "\n\n", and
// that separator counts toward the limit. A paragraph is never split: one
// that is longer than maxRunes on its own becomes a chunk that exceeds the
// limit. The result is nil when text contains no non-blank paragraph.
func Chunk(text string, maxRunes int) []string {
	const sep = "\n\n"

	var (
		chunks []string
		cur    strings.Builder
		curLen int // runes buffered in cur, tracked so the chunk is never rescanned
	)
	for _, p := range strings.Split(text, sep) {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}
		pLen := len([]rune(p))

		// Flush first if appending this paragraph would overflow the limit.
		if curLen > 0 && curLen+len(sep)+pLen > maxRunes {
			chunks = append(chunks, cur.String())
			cur.Reset()
			curLen = 0
		}
		if curLen > 0 {
			cur.WriteString(sep)
			curLen += len(sep)
		}
		cur.WriteString(p)
		curLen += pLen
	}
	if curLen > 0 {
		chunks = append(chunks, cur.String())
	}
	return chunks
}
+52
View File
@@ -0,0 +1,52 @@
package parse_test
import (
"strings"
"testing"
"qbank/internal/parse"
)
// TestChunk covers the packing rules of parse.Chunk: small inputs stay in one
// chunk, paragraphs are separated once the limit would be exceeded, a single
// oversized paragraph is kept whole, and blank paragraphs are ignored.
func TestChunk(t *testing.T) {
	t.Run("small text stays in one chunk", func(t *testing.T) {
		text := "Para one.\n\nPara two.\n\nPara three."
		chunks := parse.Chunk(text, 1000)
		// Fatalf rather than Errorf: the content check below indexes
		// chunks[0] and would panic if no chunk came back.
		if len(chunks) != 1 {
			t.Fatalf("want 1 chunk, got %d: %v", len(chunks), chunks)
		}
		if !strings.Contains(chunks[0], "Para one") || !strings.Contains(chunks[0], "Para three") {
			t.Errorf("content lost: %q", chunks[0])
		}
	})

	t.Run("paragraphs split when over limit", func(t *testing.T) {
		para := strings.Repeat("x", 600)
		text := para + "\n\n" + para + "\n\n" + para
		chunks := parse.Chunk(text, 1000)
		if len(chunks) < 2 {
			t.Errorf("want ≥2 chunks for 1800-rune input with 1000 limit, got %d", len(chunks))
		}
		// No chunk should combine paragraphs past the limit: two 600-rune
		// paragraphs joined by "\n\n" would be 1202 runes.
		for i, c := range chunks {
			if len([]rune(c)) > 1200 {
				t.Errorf("chunk %d is %d runes, too large", i, len([]rune(c)))
			}
		}
	})

	t.Run("single oversized paragraph kept as own chunk", func(t *testing.T) {
		bigPara := strings.Repeat("x", 2000)
		chunks := parse.Chunk(bigPara, 1000)
		if len(chunks) != 1 {
			t.Errorf("want 1 chunk for single oversized para, got %d", len(chunks))
		}
	})

	t.Run("empty paragraphs ignored", func(t *testing.T) {
		text := "\n\nPara one.\n\n\n\nPara two.\n\n"
		chunks := parse.Chunk(text, 1000)
		if len(chunks) != 1 {
			t.Errorf("want 1 chunk after ignoring blanks, got %d", len(chunks))
		}
	})
}
+74
View File
@@ -0,0 +1,74 @@
package parse
import (
"archive/zip"
"bytes"
"encoding/xml"
"errors"
"fmt"
"io"
"strings"
)
// ExtractDOCX returns the plain text of the .docx document read from r.
//
// A .docx file is a ZIP archive. The whole input is buffered in memory, the
// archive entry named word/document.xml is located, and its XML is handed to
// parseDocXML, which collects the <w:t> text and ends each <w:p> paragraph
// with a newline.
//
// An error is returned when r cannot be read, the data is not a valid ZIP
// archive, the archive has no word/document.xml entry, that entry cannot be
// opened, or its XML fails to parse.
func ExtractDOCX(r io.Reader) (string, error) {
	raw, err := io.ReadAll(r)
	if err != nil {
		return "", fmt.Errorf("read docx: %w", err)
	}

	archive, err := zip.NewReader(bytes.NewReader(raw), int64(len(raw)))
	if err != nil {
		return "", fmt.Errorf("open docx zip: %w", err)
	}

	for _, entry := range archive.File {
		if entry.Name != "word/document.xml" {
			continue
		}
		// Only the first matching entry is used; the function returns here.
		body, err := entry.Open()
		if err != nil {
			return "", fmt.Errorf("open document.xml: %w", err)
		}
		defer body.Close()
		return parseDocXML(body)
	}
	return "", errors.New("word/document.xml not found in docx")
}
// parseDocXML streams the WordprocessingML read from r and returns its text.
//
// Character data inside <w:t> elements is copied verbatim. A newline is
// written at the end of every paragraph (<w:p>) and for every <w:br> or
// <w:cr> line break, and a tab is written for every <w:tab> that is a direct
// child of a run (<w:r>), so that words separated only by such markup are not
// fused together. A <w:tab> anywhere else (for example a tab-stop definition
// inside <w:tabs>) produces no output. Elements are matched on their local
// name only; the namespace is not checked. Leading and trailing whitespace of
// the result is trimmed.
//
// If the decoder reports anything other than io.EOF, that error is returned
// wrapped and the text is empty.
func parseDocXML(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)

	var (
		sb     strings.Builder
		inText bool
		open   []string // local names of the currently open elements
	)
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("parse document.xml: %w", err)
		}

		switch t := tok.(type) {
		case xml.StartElement:
			parent := ""
			if len(open) > 0 {
				parent = open[len(open)-1]
			}
			switch t.Name.Local {
			case "t":
				inText = true
			case "tab":
				// Only a tab inside a run is a tab character.
				if parent == "r" {
					sb.WriteByte('\t')
				}
			case "br", "cr":
				sb.WriteByte('\n')
			}
			open = append(open, t.Name.Local)
		case xml.EndElement:
			if len(open) > 0 {
				open = open[:len(open)-1]
			}
			switch t.Name.Local {
			case "t":
				inText = false
			case "p":
				sb.WriteByte('\n')
			}
		case xml.CharData:
			if inText {
				sb.Write(t)
			}
		}
	}
	return strings.TrimSpace(sb.String()), nil
}
+62
View File
@@ -0,0 +1,62 @@
package parse_test
import (
"archive/zip"
"bytes"
"strings"
"testing"
"qbank/internal/parse"
)
func TestExtractDOCX(t *testing.T) {
const docXML = `<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>Question 1: What is Go?</w:t></w:r></w:p>
<w:p><w:r><w:t>A) A compiled language</w:t></w:r></w:p>
<w:p><w:r><w:t>B) An interpreted language</w:t></w:r></w:p>
<w:p><w:r><w:t>C) A markup language</w:t></w:r></w:p>
</w:body>
</w:document>`
docx := buildDocx(t, docXML)
text, err := parse.ExtractDOCX(bytes.NewReader(docx))
if err != nil {
t.Fatalf("ExtractDOCX: %v", err)
}
for _, want := range []string{"Question 1", "compiled language", "interpreted language"} {
if !strings.Contains(text, want) {
t.Errorf("output missing %q; got:\n%s", want, text)
}
}
}
// TestExtractDOCX_MissingXML checks that parse.ExtractDOCX returns an error
// for a valid ZIP archive that has no entries, and therefore no
// word/document.xml.
func TestExtractDOCX_MissingXML(t *testing.T) {
	var buf bytes.Buffer
	w := zip.NewWriter(&buf)
	// Close finalizes the archive; if it fails the fixture is not the empty
	// ZIP this test means to exercise.
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	_, err := parse.ExtractDOCX(bytes.NewReader(buf.Bytes()))
	if err == nil {
		t.Error("expected error for docx without document.xml")
	}
}
// buildDocx returns the bytes of an in-memory ZIP archive whose only entry,
// word/document.xml, holds xmlContent. Any failure while writing the archive
// aborts the calling test via t.Fatal.
func buildDocx(t *testing.T, xmlContent string) []byte {
	t.Helper()

	archive := new(bytes.Buffer)
	zw := zip.NewWriter(archive)

	entry, err := zw.Create("word/document.xml")
	if err != nil {
		t.Fatal(err)
	}
	_, err = entry.Write([]byte(xmlContent))
	if err != nil {
		t.Fatal(err)
	}
	// Close writes the central directory; the bytes are not a complete
	// archive until it succeeds.
	err = zw.Close()
	if err != nil {
		t.Fatal(err)
	}
	return archive.Bytes()
}
+62
View File
@@ -0,0 +1,62 @@
package parse
import (
"bytes"
"errors"
"fmt"
"io"
"strings"
"unicode"
"github.com/ledongthuc/pdf"
)
// ErrScanPDF is returned by ExtractPDF when the extracted text is empty or
// non-textual, indicating a scan-based (image-only) PDF that cannot be parsed.
// ExtractPDF returns it unwrapped, so callers can detect it with errors.Is.
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
// ExtractPDF returns the plain text of the PDF read from r, with the text of
// each page followed by a newline.
//
// The whole input is buffered in memory before parsing. Null pages and pages
// whose text extraction fails are skipped rather than treated as fatal. If
// the combined text is judged non-textual by isGibberish, ErrScanPDF is
// returned. Read and parse failures are returned wrapped.
func ExtractPDF(r io.Reader) (string, error) {
	raw, err := io.ReadAll(r)
	if err != nil {
		return "", fmt.Errorf("read pdf: %w", err)
	}

	doc, err := pdf.NewReader(bytes.NewReader(raw), int64(len(raw)))
	if err != nil {
		return "", fmt.Errorf("parse pdf: %w", err)
	}

	var out strings.Builder
	// Pages are numbered from 1.
	for n := 1; n <= doc.NumPage(); n++ {
		pg := doc.Page(n)
		if pg.V.IsNull() {
			continue
		}
		// Best effort: a page that cannot be decoded contributes nothing.
		if pageText, err := pg.GetPlainText(nil); err == nil {
			out.WriteString(pageText)
			out.WriteByte('\n')
		}
	}

	if extracted := out.String(); !isGibberish(extracted) {
		return extracted, nil
	}
	return "", ErrScanPDF
}
// isGibberish reports whether text looks like non-textual extraction output:
// it is shorter than 50 runes, or fewer than 2% of its runes are letters or
// digits.
func isGibberish(text string) bool {
	var total, alnum int
	for _, r := range text {
		total++
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			alnum++
		}
	}
	// The length check comes first, so the ratio is never computed for an
	// empty string.
	if total < 50 {
		return true
	}
	return float64(alnum)/float64(total) < 0.02
}
+37
View File
@@ -0,0 +1,37 @@
package parse
import (
"strings"
"testing"
)
// TestIsGibberish is a table-driven check of isGibberish around its two
// thresholds: the 50-rune minimum length and the 2% letter-or-digit ratio.
func TestIsGibberish(t *testing.T) {
	type testCase struct {
		name string
		text string
		want bool
	}
	cases := []testCase{
		{"empty", "", true},
		{"too short", "hello", true},
		{"exactly 50 letters", strings.Repeat("a", 50), false},
		{"49 letters", strings.Repeat("a", 49), true},
		{"all punctuation", strings.Repeat(".", 100), true},
		{"1% alpha", strings.Repeat(".", 99) + "a", true},
		{"2% alpha exactly", strings.Repeat(".", 49) + "a" + strings.Repeat(".", 49) + "a", false},
		{"normal text", "The quick brown fox jumps over the lazy dog. " + strings.Repeat("word ", 10), false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := isGibberish(tc.text)
			if got == tc.want {
				return
			}
			// Only the first 20 bytes of the input are shown in the failure.
			t.Errorf("isGibberish(%q…) = %v, want %v", tc.text[:min(len(tc.text), 20)], got, tc.want)
		})
	}
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}