Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 13:03:04 +03:00
parent d9de37d3d8
commit e53e7662e9
13 changed files with 628 additions and 0 deletions
@@ -11,6 +11,7 @@ type Config struct {
 	DataDir       string
 	Port          string
 	AdminUsers    []AdminUser
+	LLMModel      string // defaults to gpt-4o-mini
 }

 type AdminUser struct {
@@ -24,6 +25,7 @@ func Load() *Config {
 		SessionSecret: os.Getenv("SESSION_SECRET"),
 		DataDir:       envOr("DATA_DIR", "./data"),
 		Port:          envOr("PORT", "8080"),
+		LLMModel:      envOr("LLM_MODEL", "gpt-4o-mini"),
 	}
 	cfg.AdminUsers = parseAdminUsers(os.Getenv("ADMIN_USERS"))
 	return cfg
@@ -0,0 +1,119 @@
+package llm
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	openai "github.com/sashabaranov/go-openai"
+)
+
+// ParsedQuestion is a multiple-choice question extracted from a document
+// chunk by the LLM.
+type ParsedQuestion struct {
+	// Question is the question text.
+	Question string
+	// Answers holds the answer choices offered for the question.
+	Answers  []ParsedAnswer
+}
+
+// ParsedAnswer is one answer choice for a ParsedQuestion.
+type ParsedAnswer struct {
+	// Text is the wording of the choice.
+	Text    string
+	// Correct reports whether the LLM marked this choice as the right answer.
+	Correct bool
+}
+
+// ChatClient is the interface for creating chat completions.
+// The concrete *openai.Client satisfies this interface.
+// It is the only part of the OpenAI client that Client calls, so a stub with
+// this single method can stand in for the real API.
+type ChatClient interface {
+	// CreateChatCompletion sends req and returns the completion response.
+	CreateChatCompletion(ctx context.Context, req openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error)
+}
+
+// Client wraps a ChatClient with question-extraction logic.
+// Construct one with New or NewWithClient.
+type Client struct {
+	// cc performs the chat completion calls.
+	cc    ChatClient
+	// model is the model name placed in every completion request.
+	model string
+}
+
+// New returns a Client that talks to the real OpenAI API using apiKey.
+// When model is the empty string the client uses "gpt-4o-mini".
+func New(apiKey, model string) *Client {
+	client := &Client{
+		cc:    openai.NewClient(apiKey),
+		model: "gpt-4o-mini",
+	}
+	if model != "" {
+		client.model = model
+	}
+	return client
+}
+
+// NewWithClient creates a Client with an injected ChatClient (useful for tests).
+// As with New, an empty model falls back to "gpt-4o-mini" so that both
+// constructors never produce a Client that sends requests without a model.
+func NewWithClient(cc ChatClient, model string) *Client {
+	if model == "" {
+		model = "gpt-4o-mini"
+	}
+	return &Client{cc: cc, model: model}
+}
+
+// systemPrompt is the system message sent with every extraction request. It
+// tells the model to return only questions present in the text, each with
+// exactly one answer marked correct, encoded as a JSON object with a
+// "questions" array.
+const systemPrompt = `You extract multiple-choice questions from study material. Return every question found. Exactly one answer per question must be marked correct. If the source doesn't clearly mark a correct answer, omit that question entirely. Do not invent questions not present in the text.
+
+Respond with JSON matching this schema exactly:
+{"questions":[{"question":"<text>","answers":[{"text":"<text>","correct":false},{"text":"<text>","correct":true}]}]}`
+
+// llmResponse mirrors the JSON object the model is asked to produce: a
+// top-level "questions" array whose entries carry the question text and its
+// answer choices. It is only used as a decoding target.
+type llmResponse struct {
+	Questions []struct {
+		Question string `json:"question"`
+		Answers  []struct {
+			Text    string `json:"text"`
+			Correct bool   `json:"correct"`
+		} `json:"answers"`
+	} `json:"questions"`
+}
+
+// ExtractQuestions sends chunk to the LLM and returns validated, deduplicated
+// questions.
+//
+// The request uses the system prompt plus chunk as the user message and asks
+// for a JSON-object response. The first choice of the reply is decoded and
+// each question is filtered:
+//   - questions whose text is blank after trimming are dropped;
+//   - questions that do not have exactly one correct answer are dropped;
+//   - a question whose text matches an earlier one, ignoring differences in
+//     whitespace, is dropped.
+//
+// Dropping is silent. The returned question text has surrounding whitespace
+// removed. An error is returned when the API call fails, when the reply has
+// no choices, or when the reply content is not valid JSON for the expected
+// shape. The result is nil when no question survives filtering.
+func (c *Client) ExtractQuestions(ctx context.Context, chunk string) ([]ParsedQuestion, error) {
+	resp, err := c.cc.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
+		Model: c.model,
+		Messages: []openai.ChatCompletionMessage{
+			{Role: openai.ChatMessageRoleSystem, Content: systemPrompt},
+			{Role: openai.ChatMessageRoleUser, Content: chunk},
+		},
+		ResponseFormat: &openai.ChatCompletionResponseFormat{
+			Type: openai.ChatCompletionResponseFormatTypeJSONObject,
+		},
+	})
+	if err != nil {
+		return nil, fmt.Errorf("openai: %w", err)
+	}
+	if len(resp.Choices) == 0 {
+		return nil, fmt.Errorf("openai: empty response")
+	}
+
+	var raw llmResponse
+	if err := json.Unmarshal([]byte(resp.Choices[0].Message.Content), &raw); err != nil {
+		return nil, fmt.Errorf("parse llm response: %w", err)
+	}
+
+	seen := make(map[string]bool, len(raw.Questions))
+	var out []ParsedQuestion
+	for _, q := range raw.Questions {
+		// A question without any text cannot be shown to a user.
+		text := strings.TrimSpace(q.Question)
+		if text == "" {
+			continue
+		}
+
+		var nCorrect int
+		for _, a := range q.Answers {
+			if a.Correct {
+				nCorrect++
+			}
+		}
+		if nCorrect != 1 {
+			continue
+		}
+
+		// Collapse runs of whitespace before hashing so that the same question
+		// re-emitted with different spacing or line breaks is still a duplicate.
+		key := textHash(strings.Join(strings.Fields(text), " "))
+		if seen[key] {
+			continue
+		}
+		seen[key] = true
+
+		pq := ParsedQuestion{
+			Question: text,
+			Answers:  make([]ParsedAnswer, 0, len(q.Answers)),
+		}
+		for _, a := range q.Answers {
+			pq.Answers = append(pq.Answers, ParsedAnswer{Text: a.Text, Correct: a.Correct})
+		}
+		out = append(out, pq)
+	}
+	return out, nil
+}
+
+// textHash returns a short fingerprint of s: the first 8 bytes of its SHA-256
+// digest rendered as 16 lowercase hex characters.
+func textHash(s string) string {
+	hasher := sha256.New()
+	hasher.Write([]byte(s))
+	digest := hasher.Sum(nil)
+	return fmt.Sprintf("%x", digest[:8])
+}
@@ -0,0 +1,129 @@
+package llm_test
+
+import (
+	"context"
+	"encoding/json"
+	"testing"
+
+	openai "github.com/sashabaranov/go-openai"
+
+	"qbank/internal/llm"
+)
+
+// mockChat is a canned llm.ChatClient for tests: every call succeeds and
+// answers with body as the content of a single choice.
+type mockChat struct {
+	body string
+}
+
+// CreateChatCompletion ignores its arguments and returns the canned reply.
+func (m *mockChat) CreateChatCompletion(_ context.Context, _ openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error) {
+	var resp openai.ChatCompletionResponse
+	resp.Choices = append(resp.Choices, openai.ChatCompletionChoice{
+		Message: openai.ChatCompletionMessage{Content: m.body},
+	})
+	return resp, nil
+}
+
+// mockClient builds an llm.Client whose backend always replies with the given
+// questions wrapped in a {"questions": ...} JSON object. The test fails
+// immediately if the payload cannot be encoded.
+func mockClient(t *testing.T, questions []map[string]any) *llm.Client {
+	t.Helper()
+	payload := map[string]any{"questions": questions}
+	encoded, err := json.Marshal(payload)
+	if err != nil {
+		t.Fatal(err)
+	}
+	stub := &mockChat{body: string(encoded)}
+	return llm.NewWithClient(stub, "test-model")
+}
+
+// TestExtractQuestions_HappyPath checks that a single well-formed question is
+// returned with its text, all of its answers, and the Correct flag on the
+// right answer only.
+func TestExtractQuestions_HappyPath(t *testing.T) {
+	qs, err := mockClient(t, []map[string]any{
+		{
+			"question": "What is 2+2?",
+			"answers": []map[string]any{
+				{"text": "3", "correct": false},
+				{"text": "4", "correct": true},
+				{"text": "5", "correct": false},
+			},
+		},
+	}).ExtractQuestions(context.Background(), "text")
+
+	if err != nil {
+		t.Fatalf("ExtractQuestions: %v", err)
+	}
+	if len(qs) != 1 {
+		t.Fatalf("want 1 question, got %d", len(qs))
+	}
+	if qs[0].Question != "What is 2+2?" {
+		t.Errorf("wrong question text: %q", qs[0].Question)
+	}
+	if len(qs[0].Answers) != 3 {
+		t.Fatalf("want 3 answers, got %d", len(qs[0].Answers))
+	}
+	// The flags must survive the copy into ParsedAnswer: only "4" is correct.
+	for _, a := range qs[0].Answers {
+		if want := a.Text == "4"; a.Correct != want {
+			t.Errorf("answer %q: Correct = %v, want %v", a.Text, a.Correct, want)
+		}
+	}
+}
+
+// TestExtractQuestions_DropsInvalid feeds one question with two correct
+// answers, one with none, and one valid question, and expects only the valid
+// one to come back.
+func TestExtractQuestions_DropsInvalid(t *testing.T) {
+	choice := func(text string, correct bool) map[string]any {
+		return map[string]any{"text": text, "correct": correct}
+	}
+	fixture := []map[string]any{
+		{
+			"question": "Two correct — should drop",
+			"answers":  []map[string]any{choice("A", true), choice("B", true)},
+		},
+		{
+			"question": "Zero correct — should drop",
+			"answers":  []map[string]any{choice("A", false), choice("B", false)},
+		},
+		{
+			"question": "Valid question",
+			"answers":  []map[string]any{choice("Wrong", false), choice("Right", true)},
+		},
+	}
+
+	client := mockClient(t, fixture)
+	got, err := client.ExtractQuestions(context.Background(), "text")
+	if err != nil {
+		t.Fatalf("ExtractQuestions: %v", err)
+	}
+	if n := len(got); n != 1 {
+		t.Fatalf("want 1 question after dropping invalid, got %d", n)
+	}
+	if kept := got[0].Question; kept != "Valid question" {
+		t.Errorf("wrong question kept: %q", kept)
+	}
+}
+
+// TestExtractQuestions_Dedup sends the same question twice in one response
+// and expects a single question back.
+func TestExtractQuestions_Dedup(t *testing.T) {
+	duplicate := map[string]any{
+		"question": "Duplicate?",
+		"answers": []map[string]any{
+			{"text": "Yes", "correct": true},
+			{"text": "No", "correct": false},
+		},
+	}
+
+	client := mockClient(t, []map[string]any{duplicate, duplicate})
+	got, err := client.ExtractQuestions(context.Background(), "text")
+	if err != nil {
+		t.Fatalf("ExtractQuestions: %v", err)
+	}
+	if n := len(got); n != 1 {
+		t.Errorf("want 1 unique question after dedup, got %d", n)
+	}
+}
+
+// TestExtractQuestions_EmptyResponse expects an empty "questions" array to
+// yield no questions and no error.
+func TestExtractQuestions_EmptyResponse(t *testing.T) {
+	client := mockClient(t, []map[string]any{})
+	got, err := client.ExtractQuestions(context.Background(), "text")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(got); n != 0 {
+		t.Errorf("want 0 questions for empty response, got %d", n)
+	}
+}
@@ -0,0 +1,53 @@
+package parse_test
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+
+	"qbank/internal/parse"
+)
+
+// TestAcceptanceDOCXPipeline runs a handcrafted in-memory docx through
+// ExtractDOCX and then Chunk, checking that known phrases survive extraction
+// and that chunking yields at least one chunk.
+func TestAcceptanceDOCXPipeline(t *testing.T) {
+	const docXML = `<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p><w:r><w:t>1. Which keyword declares a variable in Go?</w:t></w:r></w:p>
+    <w:p><w:r><w:t>A) var</w:t></w:r></w:p>
+    <w:p><w:r><w:t>B) let</w:t></w:r></w:p>
+    <w:p><w:r><w:t>C) dim</w:t></w:r></w:p>
+    <w:p><w:r><w:t>Correct: A</w:t></w:r></w:p>
+    <w:p><w:r><w:t>2. What does fmt.Println return?</w:t></w:r></w:p>
+    <w:p><w:r><w:t>A) Nothing</w:t></w:r></w:p>
+    <w:p><w:r><w:t>B) n int, err error</w:t></w:r></w:p>
+    <w:p><w:r><w:t>Correct: B</w:t></w:r></w:p>
+  </w:body>
+</w:document>`
+
+	archive := buildDocx(t, docXML)
+	text, err := parse.ExtractDOCX(bytes.NewReader(archive))
+	if err != nil {
+		t.Fatalf("ExtractDOCX: %v", err)
+	}
+
+	// Phrases taken from both questions and from an answer line.
+	for _, phrase := range []string{
+		"Which keyword declares a variable",
+		"fmt.Println",
+		"n int, err error",
+	} {
+		if strings.Contains(text, phrase) {
+			continue
+		}
+		t.Errorf("text missing %q\nfull text:\n%s", phrase, text)
+	}
+
+	// Chunking should produce at least 1 chunk.
+	chunks := parse.Chunk(text, 10_000)
+	nChunks := len(chunks)
+	if nChunks == 0 {
+		t.Error("Chunk returned 0 chunks for non-empty text")
+	}
+	t.Logf("extracted %d chars, %d chunk(s)", len(text), nChunks)
+}
@@ -0,0 +1,31 @@
+package parse
+
+import "strings"
+
+// Chunk splits text on double-newlines and builds chunks of at most maxRunes.
+//
+// Paragraphs are trimmed, blank ones are skipped, and consecutive paragraphs
+// are joined with "\n\n" for as long as the joined length (counted in runes,
+// separator included) stays within maxRunes. A single paragraph longer than
+// maxRunes is kept as its own chunk; it is never cut. The result is nil when
+// text contains no non-blank paragraph.
+//
+// NOTE(review): only blank lines ("\n\n") are treated as split points. The
+// extractors in this package appear to separate paragraphs and pages with a
+// single newline — confirm that their output actually contains blank lines,
+// otherwise a whole document ends up in one oversized chunk.
+func Chunk(text string, maxRunes int) []string {
+	var (
+		chunks []string
+		cur    strings.Builder
+		curLen int // runes currently buffered in cur
+	)
+
+	for _, p := range strings.Split(text, "\n\n") {
+		p = strings.TrimSpace(p)
+		if p == "" {
+			continue
+		}
+		// Count runes without allocating a []rune copy of the paragraph.
+		pLen := 0
+		for range p {
+			pLen++
+		}
+		// Flush when appending this paragraph (plus separator) would overflow.
+		// Tracking curLen avoids re-decoding the whole buffer per paragraph.
+		if curLen > 0 && curLen+2+pLen > maxRunes {
+			chunks = append(chunks, cur.String())
+			cur.Reset()
+			curLen = 0
+		}
+		if curLen > 0 {
+			cur.WriteString("\n\n")
+			curLen += 2
+		}
+		cur.WriteString(p)
+		curLen += pLen
+	}
+	if curLen > 0 {
+		chunks = append(chunks, cur.String())
+	}
+	return chunks
+}
@@ -0,0 +1,52 @@
+package parse_test
+
+import (
+	"strings"
+	"testing"
+
+	"qbank/internal/parse"
+)
+
+// TestChunk covers the main behaviours of parse.Chunk: merging small
+// paragraphs, splitting when the limit would be exceeded, keeping a single
+// oversized paragraph whole, and ignoring blank paragraphs.
+func TestChunk(t *testing.T) {
+	t.Run("small text stays in one chunk", func(t *testing.T) {
+		text := "Para one.\n\nPara two.\n\nPara three."
+		chunks := parse.Chunk(text, 1000)
+		// Fatalf, not Errorf: chunks[0] below would panic on an empty result.
+		if len(chunks) != 1 {
+			t.Fatalf("want 1 chunk, got %d: %v", len(chunks), chunks)
+		}
+		if !strings.Contains(chunks[0], "Para one") || !strings.Contains(chunks[0], "Para three") {
+			t.Errorf("content lost: %q", chunks[0])
+		}
+	})
+
+	t.Run("paragraphs split when over limit", func(t *testing.T) {
+		const limit = 1000
+		para := strings.Repeat("x", 600)
+		text := para + "\n\n" + para + "\n\n" + para
+		chunks := parse.Chunk(text, limit)
+		if len(chunks) < 2 {
+			t.Errorf("want ≥2 chunks for 1800-rune input with 1000 limit, got %d", len(chunks))
+		}
+		// Every paragraph fits on its own, so no chunk may exceed the limit.
+		for i, c := range chunks {
+			if n := len([]rune(c)); n > limit {
+				t.Errorf("chunk %d is %d runes, too large", i, n)
+			}
+		}
+	})
+
+	t.Run("single oversized paragraph kept as own chunk", func(t *testing.T) {
+		bigPara := strings.Repeat("x", 2000)
+		chunks := parse.Chunk(bigPara, 1000)
+		if len(chunks) != 1 {
+			t.Errorf("want 1 chunk for single oversized para, got %d", len(chunks))
+		}
+	})
+
+	t.Run("empty paragraphs ignored", func(t *testing.T) {
+		text := "\n\nPara one.\n\n\n\nPara two.\n\n"
+		chunks := parse.Chunk(text, 1000)
+		if len(chunks) != 1 {
+			t.Errorf("want 1 chunk after ignoring blanks, got %d", len(chunks))
+		}
+	})
+}
@@ -0,0 +1,74 @@
+package parse
+
+import (
+	"archive/zip"
+	"bytes"
+	"encoding/xml"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// ExtractDOCX returns the plain text of the .docx document read from r.
+//
+// A DOCX file is a ZIP archive, so r is read fully into memory, opened as a
+// zip, and the first entry named word/document.xml is handed to parseDocXML,
+// which collects <w:t> text and emits a newline per <w:p>. An error is
+// returned when r cannot be read, is not a valid zip, has no
+// word/document.xml entry, or that entry cannot be opened or parsed.
+func ExtractDOCX(r io.Reader) (string, error) {
+	raw, err := io.ReadAll(r)
+	if err != nil {
+		return "", fmt.Errorf("read docx: %w", err)
+	}
+
+	archive, err := zip.NewReader(bytes.NewReader(raw), int64(len(raw)))
+	if err != nil {
+		return "", fmt.Errorf("open docx zip: %w", err)
+	}
+
+	for _, entry := range archive.File {
+		if entry.Name != "word/document.xml" {
+			continue
+		}
+		body, err := entry.Open()
+		if err != nil {
+			return "", fmt.Errorf("open document.xml: %w", err)
+		}
+		// Returning right away, so this defer fires once for the one entry.
+		defer body.Close()
+		return parseDocXML(body)
+	}
+	return "", errors.New("word/document.xml not found in docx")
+}
+
+// parseDocXML streams the XML of a word/document.xml part from r and returns
+// its plain text with leading and trailing whitespace removed.
+//
+// Elements are matched by local name only (namespace prefixes are ignored):
+//   - character data inside <t> is copied to the output;
+//   - the end of each <p> emits a newline;
+//   - <br> and <cr> emit a newline and <tab> emits a tab, so text on either
+//     side of an in-run break is not fused into one word;
+//   - <tab> elements nested in <tabs> are tab-stop definitions, not content,
+//     and are skipped.
+//
+// A malformed document yields a wrapped decoder error.
+func parseDocXML(r io.Reader) (string, error) {
+	dec := xml.NewDecoder(r)
+	var sb strings.Builder
+	var inText, inTabs bool
+	for {
+		tok, err := dec.Token()
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		if err != nil {
+			return "", fmt.Errorf("parse document.xml: %w", err)
+		}
+		switch t := tok.(type) {
+		case xml.StartElement:
+			// Self-closing elements such as <w:br/> arrive as a start token
+			// followed by an end token, so handling the start is enough.
+			switch t.Name.Local {
+			case "t":
+				inText = true
+			case "tabs":
+				inTabs = true
+			case "tab":
+				if !inTabs {
+					sb.WriteByte('\t')
+				}
+			case "br", "cr":
+				sb.WriteByte('\n')
+			}
+		case xml.EndElement:
+			switch t.Name.Local {
+			case "t":
+				inText = false
+			case "tabs":
+				inTabs = false
+			case "p":
+				sb.WriteByte('\n')
+			}
+		case xml.CharData:
+			if inText {
+				sb.Write([]byte(t))
+			}
+		}
+	}
+	return strings.TrimSpace(sb.String()), nil
+}
@@ -0,0 +1,62 @@
+package parse_test
+
+import (
+	"archive/zip"
+	"bytes"
+	"strings"
+	"testing"
+
+	"qbank/internal/parse"
+)
+
+// TestExtractDOCX builds a minimal in-memory docx with four paragraphs and
+// checks that text from several of them appears in the extracted output.
+func TestExtractDOCX(t *testing.T) {
+	const docXML = `<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p><w:r><w:t>Question 1: What is Go?</w:t></w:r></w:p>
+    <w:p><w:r><w:t>A) A compiled language</w:t></w:r></w:p>
+    <w:p><w:r><w:t>B) An interpreted language</w:t></w:r></w:p>
+    <w:p><w:r><w:t>C) A markup language</w:t></w:r></w:p>
+  </w:body>
+</w:document>`
+
+	archive := buildDocx(t, docXML)
+	got, err := parse.ExtractDOCX(bytes.NewReader(archive))
+	if err != nil {
+		t.Fatalf("ExtractDOCX: %v", err)
+	}
+
+	expected := []string{"Question 1", "compiled language", "interpreted language"}
+	for i := range expected {
+		if strings.Contains(got, expected[i]) {
+			continue
+		}
+		t.Errorf("output missing %q; got:\n%s", expected[i], got)
+	}
+}
+
+// TestExtractDOCX_MissingXML checks that a valid but empty zip archive, which
+// has no word/document.xml entry, is rejected with an error.
+func TestExtractDOCX_MissingXML(t *testing.T) {
+	var buf bytes.Buffer
+	w := zip.NewWriter(&buf)
+	// Close writes the central directory; without it buf is not a valid zip
+	// and the test would exercise the wrong error path.
+	if err := w.Close(); err != nil {
+		t.Fatalf("close empty zip: %v", err)
+	}
+
+	_, err := parse.ExtractDOCX(bytes.NewReader(buf.Bytes()))
+	if err == nil {
+		t.Error("expected error for docx without document.xml")
+	}
+}
+
+// buildDocx returns the bytes of a zip archive containing a single
+// word/document.xml entry whose content is xmlContent. Any failure while
+// writing the archive aborts the calling test.
+func buildDocx(t *testing.T, xmlContent string) []byte {
+	t.Helper()
+	archive := new(bytes.Buffer)
+	zw := zip.NewWriter(archive)
+
+	// Run the three steps in order, stopping at the first error.
+	entry, err := zw.Create("word/document.xml")
+	if err == nil {
+		_, err = entry.Write([]byte(xmlContent))
+	}
+	if err == nil {
+		err = zw.Close()
+	}
+	if err != nil {
+		t.Fatal(err)
+	}
+	return archive.Bytes()
+}
@@ -0,0 +1,62 @@
+package parse
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+	"unicode"
+
+	"github.com/ledongthuc/pdf"
+)
+
+// ErrScanPDF is returned when extracted text is empty or non-textual,
+// indicating a scan-based (image-only) PDF that cannot be parsed.
+// ExtractPDF returns it unwrapped, so callers can compare against it with
+// errors.Is.
+var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
+
+// ExtractPDF returns the plain text of the PDF read from r, with the text of
+// each page followed by a newline.
+//
+// The input is read fully into memory before parsing. Pages that are null or
+// whose text cannot be extracted are skipped rather than failing the whole
+// document. If the collected text looks empty or non-textual (see
+// isGibberish), ErrScanPDF is returned instead.
+func ExtractPDF(r io.Reader) (string, error) {
+	raw, err := io.ReadAll(r)
+	if err != nil {
+		return "", fmt.Errorf("read pdf: %w", err)
+	}
+
+	doc, err := pdf.NewReader(bytes.NewReader(raw), int64(len(raw)))
+	if err != nil {
+		return "", fmt.Errorf("parse pdf: %w", err)
+	}
+
+	var out strings.Builder
+	// The loop counts pages from 1.
+	for n := 1; n <= doc.NumPage(); n++ {
+		pg := doc.Page(n)
+		if pg.V.IsNull() {
+			continue
+		}
+		if pageText, err := pg.GetPlainText(nil); err == nil {
+			out.WriteString(pageText)
+			out.WriteByte('\n')
+		}
+	}
+
+	if text := out.String(); !isGibberish(text) {
+		return text, nil
+	}
+	return "", ErrScanPDF
+}
+
+// isGibberish reports whether text is unlikely to be real document text:
+// either it has fewer than 50 runes, or fewer than 2% of its runes are
+// letters or digits.
+func isGibberish(text string) bool {
+	total, alnum := 0, 0
+	for _, r := range text {
+		total++
+		if unicode.IsLetter(r) || unicode.IsDigit(r) {
+			alnum++
+		}
+	}
+	if total < 50 {
+		return true
+	}
+	ratio := float64(alnum) / float64(total)
+	return ratio < 0.02
+}
@@ -0,0 +1,37 @@
+package parse
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestIsGibberish exercises isGibberish around both of its thresholds: the
+// 50-rune minimum length and the 2% alphanumeric ratio.
+func TestIsGibberish(t *testing.T) {
+	type testCase struct {
+		name string
+		text string
+		want bool
+	}
+	dots := func(n int) string { return strings.Repeat(".", n) }
+	letters := func(n int) string { return strings.Repeat("a", n) }
+
+	cases := []testCase{
+		{name: "empty", text: "", want: true},
+		{name: "too short", text: "hello", want: true},
+		{name: "exactly 50 letters", text: letters(50), want: false},
+		{name: "49 letters", text: letters(49), want: true},
+		{name: "all punctuation", text: dots(100), want: true},
+		{name: "1% alpha", text: dots(99) + "a", want: true},
+		{name: "2% alpha exactly", text: dots(49) + "a" + dots(49) + "a", want: false},
+		{name: "normal text", text: "The quick brown fox jumps over the lazy dog. " + strings.Repeat("word ", 10), want: false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := isGibberish(tc.text)
+			if got == tc.want {
+				return
+			}
+			// Show only a short prefix of the input in the failure message.
+			preview := tc.text[:min(len(tc.text), 20)]
+			t.Errorf("isGibberish(%q…) = %v, want %v", preview, got, tc.want)
+		})
+	}
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}