Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,53 @@
|
||||
package parse_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"qbank/internal/parse"
|
||||
)
|
||||
|
||||
// TestAcceptanceDOCXPipeline verifies the full DOCX → text → chunk pipeline
|
||||
// using a handcrafted in-memory docx with known content.
|
||||
func TestAcceptanceDOCXPipeline(t *testing.T) {
|
||||
const docXML = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p><w:r><w:t>1. Which keyword declares a variable in Go?</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>A) var</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>B) let</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>C) dim</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>Correct: A</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>2. What does fmt.Println return?</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>A) Nothing</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>B) n int, err error</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>Correct: B</w:t></w:r></w:p>
|
||||
</w:body>
|
||||
</w:document>`
|
||||
|
||||
docx := buildDocx(t, docXML)
|
||||
|
||||
text, err := parse.ExtractDOCX(bytes.NewReader(docx))
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractDOCX: %v", err)
|
||||
}
|
||||
|
||||
wantPhrases := []string{
|
||||
"Which keyword declares a variable",
|
||||
"fmt.Println",
|
||||
"n int, err error",
|
||||
}
|
||||
for _, phrase := range wantPhrases {
|
||||
if !strings.Contains(text, phrase) {
|
||||
t.Errorf("text missing %q\nfull text:\n%s", phrase, text)
|
||||
}
|
||||
}
|
||||
|
||||
// Chunking should produce at least 1 chunk.
|
||||
chunks := parse.Chunk(text, 10_000)
|
||||
if len(chunks) == 0 {
|
||||
t.Error("Chunk returned 0 chunks for non-empty text")
|
||||
}
|
||||
t.Logf("extracted %d chars, %d chunk(s)", len(text), len(chunks))
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
package parse
|
||||
|
||||
import "strings"
|
||||
|
||||
// Chunk splits text into paragraphs on blank lines ("\n\n") and greedily packs
// consecutive paragraphs into chunks of at most maxRunes runes.
//
// Each paragraph is whitespace-trimmed and empty paragraphs are dropped.
// Paragraphs packed into the same chunk are re-joined with "\n\n", and that
// separator counts toward the limit. A single paragraph longer than maxRunes
// is never split; it is emitted as its own chunk. The result is nil when text
// contains no non-blank paragraph.
func Chunk(text string, maxRunes int) []string {
	var chunks []string
	var cur strings.Builder
	// curRunes is the rune length of cur, maintained incrementally so the
	// pending chunk is not re-decoded for every paragraph.
	curRunes := 0

	for _, p := range strings.Split(text, "\n\n") {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}
		pLen := len([]rune(p))

		// Flush the pending chunk if appending separator + paragraph would
		// push it past the limit.
		if cur.Len() > 0 && curRunes+2+pLen > maxRunes {
			chunks = append(chunks, cur.String())
			cur.Reset()
			curRunes = 0
		}
		if cur.Len() > 0 {
			cur.WriteString("\n\n")
			curRunes += 2
		}
		cur.WriteString(p)
		curRunes += pLen
	}

	if cur.Len() > 0 {
		chunks = append(chunks, cur.String())
	}
	return chunks
}
|
||||
@@ -0,0 +1,52 @@
|
||||
package parse_test
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"qbank/internal/parse"
|
||||
)
|
||||
|
||||
func TestChunk(t *testing.T) {
|
||||
t.Run("small text stays in one chunk", func(t *testing.T) {
|
||||
text := "Para one.\n\nPara two.\n\nPara three."
|
||||
chunks := parse.Chunk(text, 1000)
|
||||
if len(chunks) != 1 {
|
||||
t.Errorf("want 1 chunk, got %d: %v", len(chunks), chunks)
|
||||
}
|
||||
if !strings.Contains(chunks[0], "Para one") || !strings.Contains(chunks[0], "Para three") {
|
||||
t.Errorf("content lost: %q", chunks[0])
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("paragraphs split when over limit", func(t *testing.T) {
|
||||
para := strings.Repeat("x", 600)
|
||||
text := para + "\n\n" + para + "\n\n" + para
|
||||
chunks := parse.Chunk(text, 1000)
|
||||
if len(chunks) < 2 {
|
||||
t.Errorf("want ≥2 chunks for 1800-rune input with 1000 limit, got %d", len(chunks))
|
||||
}
|
||||
// No chunk should combine paragraphs past the limit
|
||||
for i, c := range chunks {
|
||||
if len([]rune(c)) > 1200 {
|
||||
t.Errorf("chunk %d is %d runes, too large", i, len([]rune(c)))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("single oversized paragraph kept as own chunk", func(t *testing.T) {
|
||||
bigPara := strings.Repeat("x", 2000)
|
||||
chunks := parse.Chunk(bigPara, 1000)
|
||||
if len(chunks) != 1 {
|
||||
t.Errorf("want 1 chunk for single oversized para, got %d", len(chunks))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("empty paragraphs ignored", func(t *testing.T) {
|
||||
text := "\n\nPara one.\n\n\n\nPara two.\n\n"
|
||||
chunks := parse.Chunk(text, 1000)
|
||||
if len(chunks) != 1 {
|
||||
t.Errorf("want 1 chunk after ignoring blanks, got %d", len(chunks))
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
package parse
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ExtractDOCX reads a .docx file and returns its plain text.
|
||||
// DOCX is a ZIP archive; we unzip word/document.xml, walk <w:t> nodes
|
||||
// for text, and emit a newline at each <w:p> boundary.
|
||||
func ExtractDOCX(r io.Reader) (string, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read docx: %w", err)
|
||||
}
|
||||
zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open docx zip: %w", err)
|
||||
}
|
||||
var docFile *zip.File
|
||||
for _, f := range zr.File {
|
||||
if f.Name == "word/document.xml" {
|
||||
docFile = f
|
||||
break
|
||||
}
|
||||
}
|
||||
if docFile == nil {
|
||||
return "", errors.New("word/document.xml not found in docx")
|
||||
}
|
||||
rc, err := docFile.Open()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open document.xml: %w", err)
|
||||
}
|
||||
defer rc.Close()
|
||||
return parseDocXML(rc)
|
||||
}
|
||||
|
||||
// parseDocXML streams WordprocessingML from r and returns its text content.
//
// Character data inside <w:t> elements is copied verbatim. Inside a run
// (<w:r>), <w:tab/> is rendered as a tab and <w:br/> / <w:cr/> as a newline,
// so words separated only by those elements are not glued together. A newline
// is written at the end of every <w:p>. Elements are matched by local name
// only; the namespace is not checked. Leading and trailing whitespace is
// trimmed from the result.
//
// It returns a wrapped error if the XML cannot be tokenized.
func parseDocXML(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)
	var sb strings.Builder
	var inText bool
	// runDepth > 0 while inside at least one <w:r>. A counter rather than a
	// flag so that a run nested in another run's content does not end the
	// outer run early.
	runDepth := 0

	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("parse document.xml: %w", err)
		}

		switch t := tok.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "r":
				runDepth++
			case "t":
				inText = true
			case "tab":
				// <w:tab> also occurs under <w:pPr>/<w:tabs> as a tab-stop
				// definition; only a tab inside a run is actual content.
				if runDepth > 0 {
					sb.WriteByte('\t')
				}
			case "br", "cr":
				if runDepth > 0 {
					sb.WriteByte('\n')
				}
			}
		case xml.EndElement:
			switch t.Name.Local {
			case "r":
				if runDepth > 0 {
					runDepth--
				}
			case "t":
				inText = false
			case "p":
				sb.WriteByte('\n')
			}
		case xml.CharData:
			if inText {
				sb.Write(t)
			}
		}
	}
	return strings.TrimSpace(sb.String()), nil
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package parse_test
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"qbank/internal/parse"
|
||||
)
|
||||
|
||||
func TestExtractDOCX(t *testing.T) {
|
||||
const docXML = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p><w:r><w:t>Question 1: What is Go?</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>A) A compiled language</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>B) An interpreted language</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>C) A markup language</w:t></w:r></w:p>
|
||||
</w:body>
|
||||
</w:document>`
|
||||
|
||||
docx := buildDocx(t, docXML)
|
||||
|
||||
text, err := parse.ExtractDOCX(bytes.NewReader(docx))
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractDOCX: %v", err)
|
||||
}
|
||||
for _, want := range []string{"Question 1", "compiled language", "interpreted language"} {
|
||||
if !strings.Contains(text, want) {
|
||||
t.Errorf("output missing %q; got:\n%s", want, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractDOCX_MissingXML(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
w := zip.NewWriter(&buf)
|
||||
w.Close()
|
||||
|
||||
_, err := parse.ExtractDOCX(bytes.NewReader(buf.Bytes()))
|
||||
if err == nil {
|
||||
t.Error("expected error for docx without document.xml")
|
||||
}
|
||||
}
|
||||
|
||||
// buildDocx returns the bytes of a minimal in-memory .docx: a ZIP archive
// whose only entry, word/document.xml, contains xmlContent. Any failure while
// writing the archive aborts the calling test via t.Fatal.
func buildDocx(t *testing.T, xmlContent string) []byte {
	t.Helper()

	archive := new(bytes.Buffer)
	zw := zip.NewWriter(archive)

	entry, err := zw.Create("word/document.xml")
	if err != nil {
		t.Fatal(err)
	}

	_, err = entry.Write([]byte(xmlContent))
	if err != nil {
		t.Fatal(err)
	}

	// Close must succeed before the buffer holds a complete archive.
	err = zw.Close()
	if err != nil {
		t.Fatal(err)
	}

	return archive.Bytes()
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package parse
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/ledongthuc/pdf"
|
||||
)
|
||||
|
||||
// ErrScanPDF is the sentinel error ExtractPDF returns when the text it pulled
// out of a PDF is empty or looks non-textual, which typically means the PDF
// is an image-only scan. Compare against it with errors.Is.
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
|
||||
|
||||
// ExtractPDF reads a PDF and returns its concatenated plain text.
|
||||
// Returns ErrScanPDF if the content appears to be empty or non-textual.
|
||||
func ExtractPDF(r io.Reader) (string, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read pdf: %w", err)
|
||||
}
|
||||
reader, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse pdf: %w", err)
|
||||
}
|
||||
var sb strings.Builder
|
||||
for i := 1; i <= reader.NumPage(); i++ {
|
||||
page := reader.Page(i)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sb.WriteString(text)
|
||||
sb.WriteByte('\n')
|
||||
}
|
||||
text := sb.String()
|
||||
if isGibberish(text) {
|
||||
return "", ErrScanPDF
|
||||
}
|
||||
return text, nil
|
||||
}
|
||||
|
||||
// isGibberish reports whether text is unlikely to be real document text:
// it is either shorter than 50 runes, or fewer than 2% of its runes are
// letters or digits (as classified by the unicode package).
func isGibberish(text string) bool {
	const (
		minRunes      = 50
		minAlnumRatio = 0.02
	)

	// One pass over the string counts both the total runes and the
	// alphanumeric ones.
	total, alnum := 0, 0
	for _, r := range text {
		total++
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			alnum++
		}
	}

	if total < minRunes {
		return true
	}
	return float64(alnum)/float64(total) < minAlnumRatio
}
|
||||
@@ -0,0 +1,37 @@
|
||||
package parse
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestIsGibberish(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
text string
|
||||
want bool
|
||||
}{
|
||||
{"empty", "", true},
|
||||
{"too short", "hello", true},
|
||||
{"exactly 50 letters", strings.Repeat("a", 50), false},
|
||||
{"49 letters", strings.Repeat("a", 49), true},
|
||||
{"all punctuation", strings.Repeat(".", 100), true},
|
||||
{"1% alpha", strings.Repeat(".", 99) + "a", true},
|
||||
{"2% alpha exactly", strings.Repeat(".", 49) + "a" + strings.Repeat(".", 49) + "a", false},
|
||||
{"normal text", "The quick brown fox jumps over the lazy dog. " + strings.Repeat("word ", 10), false},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := isGibberish(tt.text); got != tt.want {
|
||||
t.Errorf("isGibberish(%q…) = %v, want %v", tt.text[:min(len(tt.text), 20)], got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// min returns the smaller of a and b.
//
// NOTE(review): Go 1.21+ provides a built-in min that this package-level
// declaration shadows; it can be dropped once the module's Go version is
// confirmed to be 1.21 or newer.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||||
Reference in New Issue
Block a user