Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,3 +3,4 @@ SESSION_SECRET=change-me-to-a-random-32-char-string
|
|||||||
DATA_DIR=./data
|
DATA_DIR=./data
|
||||||
PORT=8080
|
PORT=8080
|
||||||
ADMIN_USERS=alice:password1,bob:password2
|
ADMIN_USERS=alice:password1,bob:password2
|
||||||
|
LLM_MODEL=gpt-4o-mini
|
||||||
|
|||||||
@@ -8,9 +8,11 @@ require (
|
|||||||
github.com/alexedwards/scs/v2 v2.9.0 // indirect
|
github.com/alexedwards/scs/v2 v2.9.0 // indirect
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
|
github.com/sashabaranov/go-openai v1.41.2 // indirect
|
||||||
golang.org/x/crypto v0.51.0 // indirect
|
golang.org/x/crypto v0.51.0 // indirect
|
||||||
golang.org/x/sys v0.44.0 // indirect
|
golang.org/x/sys v0.44.0 // indirect
|
||||||
modernc.org/libc v1.72.0 // indirect
|
modernc.org/libc v1.72.0 // indirect
|
||||||
|
|||||||
@@ -6,12 +6,16 @@ github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug=
|
|||||||
github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
|
github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8=
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
|
github.com/sashabaranov/go-openai v1.41.2 h1:vfPRBZNMpnqu8ELsclWcAvF19lDNgh1t6TVfFFOPiSM=
|
||||||
|
github.com/sashabaranov/go-openai v1.41.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
|
||||||
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
|
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
|
||||||
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
|
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ type Config struct {
|
|||||||
DataDir string
|
DataDir string
|
||||||
Port string
|
Port string
|
||||||
AdminUsers []AdminUser
|
AdminUsers []AdminUser
|
||||||
|
LLMModel string // defaults to gpt-4o-mini
|
||||||
}
|
}
|
||||||
|
|
||||||
type AdminUser struct {
|
type AdminUser struct {
|
||||||
@@ -24,6 +25,7 @@ func Load() *Config {
|
|||||||
SessionSecret: os.Getenv("SESSION_SECRET"),
|
SessionSecret: os.Getenv("SESSION_SECRET"),
|
||||||
DataDir: envOr("DATA_DIR", "./data"),
|
DataDir: envOr("DATA_DIR", "./data"),
|
||||||
Port: envOr("PORT", "8080"),
|
Port: envOr("PORT", "8080"),
|
||||||
|
LLMModel: envOr("LLM_MODEL", "gpt-4o-mini"),
|
||||||
}
|
}
|
||||||
cfg.AdminUsers = parseAdminUsers(os.Getenv("ADMIN_USERS"))
|
cfg.AdminUsers = parseAdminUsers(os.Getenv("ADMIN_USERS"))
|
||||||
return cfg
|
return cfg
|
||||||
|
|||||||
@@ -0,0 +1,119 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"crypto/sha256"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
openai "github.com/sashabaranov/go-openai"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Result types produced by question extraction.
type (
	// ParsedQuestion is one multiple-choice question extracted from a
	// document chunk by the LLM.
	ParsedQuestion struct {
		Question string         // question text
		Answers  []ParsedAnswer // answer choices for the question
	}

	// ParsedAnswer is a single answer choice belonging to a ParsedQuestion.
	ParsedAnswer struct {
		Text    string // answer text
		Correct bool   // true when this choice is marked as the correct one
	}
)
|
||||||
|
|
||||||
|
// ChatClient is the interface for creating chat completions.
|
||||||
|
// The concrete *openai.Client satisfies this interface.
|
||||||
|
type ChatClient interface {
|
||||||
|
CreateChatCompletion(ctx context.Context, req openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Client wraps a ChatClient with question-extraction logic.
|
||||||
|
type Client struct {
|
||||||
|
cc ChatClient
|
||||||
|
model string
|
||||||
|
}
|
||||||
|
|
||||||
|
// New creates a Client backed by the real OpenAI API.
|
||||||
|
func New(apiKey, model string) *Client {
|
||||||
|
if model == "" {
|
||||||
|
model = "gpt-4o-mini"
|
||||||
|
}
|
||||||
|
return &Client{cc: openai.NewClient(apiKey), model: model}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewWithClient creates a Client with an injected ChatClient (useful for tests).
|
||||||
|
func NewWithClient(cc ChatClient, model string) *Client {
|
||||||
|
return &Client{cc: cc, model: model}
|
||||||
|
}
|
||||||
|
|
||||||
|
// systemPrompt is the system message sent with every extraction request. It
// instructs the model to return only questions that have exactly one clearly
// marked correct answer, and pins the JSON shape that llmResponse decodes.
const systemPrompt = `You extract multiple-choice questions from study material. Return every question found. Exactly one answer per question must be marked correct. If the source doesn't clearly mark a correct answer, omit that question entirely. Do not invent questions not present in the text.

Respond with JSON matching this schema exactly:
{"questions":[{"question":"<text>","answers":[{"text":"<text>","correct":false},{"text":"<text>","correct":true}]}]}`
|
||||||
|
|
||||||
|
// llmResponse is the top-level JSON object decoded from the model's reply.
type llmResponse struct {
	Questions []llmQuestion `json:"questions"`
}

// llmQuestion is one element of the "questions" array in the reply.
type llmQuestion struct {
	Question string      `json:"question"`
	Answers  []llmAnswer `json:"answers"`
}

// llmAnswer is one element of a question's "answers" array.
type llmAnswer struct {
	Text    string `json:"text"`
	Correct bool   `json:"correct"`
}
|
||||||
|
|
||||||
|
// ExtractQuestions sends chunk to the LLM and returns validated, deduplicated questions.
|
||||||
|
// Questions that do not have exactly one correct answer are silently dropped.
|
||||||
|
func (c *Client) ExtractQuestions(ctx context.Context, chunk string) ([]ParsedQuestion, error) {
|
||||||
|
resp, err := c.cc.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
|
||||||
|
Model: c.model,
|
||||||
|
Messages: []openai.ChatCompletionMessage{
|
||||||
|
{Role: openai.ChatMessageRoleSystem, Content: systemPrompt},
|
||||||
|
{Role: openai.ChatMessageRoleUser, Content: chunk},
|
||||||
|
},
|
||||||
|
ResponseFormat: &openai.ChatCompletionResponseFormat{
|
||||||
|
Type: openai.ChatCompletionResponseFormatTypeJSONObject,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("openai: %w", err)
|
||||||
|
}
|
||||||
|
if len(resp.Choices) == 0 {
|
||||||
|
return nil, fmt.Errorf("openai: empty response")
|
||||||
|
}
|
||||||
|
|
||||||
|
var raw llmResponse
|
||||||
|
if err := json.Unmarshal([]byte(resp.Choices[0].Message.Content), &raw); err != nil {
|
||||||
|
return nil, fmt.Errorf("parse llm response: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
seen := make(map[string]bool)
|
||||||
|
var out []ParsedQuestion
|
||||||
|
for _, q := range raw.Questions {
|
||||||
|
var nCorrect int
|
||||||
|
for _, a := range q.Answers {
|
||||||
|
if a.Correct {
|
||||||
|
nCorrect++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if nCorrect != 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := textHash(q.Question)
|
||||||
|
if seen[key] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = true
|
||||||
|
|
||||||
|
pq := ParsedQuestion{Question: q.Question}
|
||||||
|
for _, a := range q.Answers {
|
||||||
|
pq.Answers = append(pq.Answers, ParsedAnswer{Text: a.Text, Correct: a.Correct})
|
||||||
|
}
|
||||||
|
out = append(out, pq)
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// textHash returns the first 8 bytes of the SHA-256 digest of s as 16
// lowercase hexadecimal characters.
func textHash(s string) string {
	hasher := sha256.New()
	hasher.Write([]byte(s)) // hash.Hash.Write never returns an error
	digest := hasher.Sum(nil)
	return fmt.Sprintf("%x", digest[:8])
}
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
package llm_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
openai "github.com/sashabaranov/go-openai"
|
||||||
|
|
||||||
|
"qbank/internal/llm"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mockChat implements llm.ChatClient for testing.
|
||||||
|
type mockChat struct{ body string }
|
||||||
|
|
||||||
|
func (m *mockChat) CreateChatCompletion(_ context.Context, _ openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error) {
|
||||||
|
return openai.ChatCompletionResponse{
|
||||||
|
Choices: []openai.ChatCompletionChoice{
|
||||||
|
{Message: openai.ChatCompletionMessage{Content: m.body}},
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func mockClient(t *testing.T, questions []map[string]any) *llm.Client {
|
||||||
|
t.Helper()
|
||||||
|
body, err := json.Marshal(map[string]any{"questions": questions})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
return llm.NewWithClient(&mockChat{body: string(body)}, "test-model")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractQuestions_HappyPath(t *testing.T) {
|
||||||
|
qs, err := mockClient(t, []map[string]any{
|
||||||
|
{
|
||||||
|
"question": "What is 2+2?",
|
||||||
|
"answers": []map[string]any{
|
||||||
|
{"text": "3", "correct": false},
|
||||||
|
{"text": "4", "correct": true},
|
||||||
|
{"text": "5", "correct": false},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}).ExtractQuestions(context.Background(), "text")
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExtractQuestions: %v", err)
|
||||||
|
}
|
||||||
|
if len(qs) != 1 {
|
||||||
|
t.Fatalf("want 1 question, got %d", len(qs))
|
||||||
|
}
|
||||||
|
if qs[0].Question != "What is 2+2?" {
|
||||||
|
t.Errorf("wrong question text: %q", qs[0].Question)
|
||||||
|
}
|
||||||
|
if len(qs[0].Answers) != 3 {
|
||||||
|
t.Errorf("want 3 answers, got %d", len(qs[0].Answers))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractQuestions_DropsInvalid(t *testing.T) {
|
||||||
|
qs, err := mockClient(t, []map[string]any{
|
||||||
|
{
|
||||||
|
"question": "Two correct — should drop",
|
||||||
|
"answers": []map[string]any{
|
||||||
|
{"text": "A", "correct": true},
|
||||||
|
{"text": "B", "correct": true},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "Zero correct — should drop",
|
||||||
|
"answers": []map[string]any{
|
||||||
|
{"text": "A", "correct": false},
|
||||||
|
{"text": "B", "correct": false},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "Valid question",
|
||||||
|
"answers": []map[string]any{
|
||||||
|
{"text": "Wrong", "correct": false},
|
||||||
|
{"text": "Right", "correct": true},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}).ExtractQuestions(context.Background(), "text")
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExtractQuestions: %v", err)
|
||||||
|
}
|
||||||
|
if len(qs) != 1 {
|
||||||
|
t.Fatalf("want 1 question after dropping invalid, got %d", len(qs))
|
||||||
|
}
|
||||||
|
if qs[0].Question != "Valid question" {
|
||||||
|
t.Errorf("wrong question kept: %q", qs[0].Question)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractQuestions_Dedup(t *testing.T) {
|
||||||
|
qs, err := mockClient(t, []map[string]any{
|
||||||
|
{
|
||||||
|
"question": "Duplicate?",
|
||||||
|
"answers": []map[string]any{
|
||||||
|
{"text": "Yes", "correct": true},
|
||||||
|
{"text": "No", "correct": false},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "Duplicate?",
|
||||||
|
"answers": []map[string]any{
|
||||||
|
{"text": "Yes", "correct": true},
|
||||||
|
{"text": "No", "correct": false},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}).ExtractQuestions(context.Background(), "text")
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExtractQuestions: %v", err)
|
||||||
|
}
|
||||||
|
if len(qs) != 1 {
|
||||||
|
t.Errorf("want 1 unique question after dedup, got %d", len(qs))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractQuestions_EmptyResponse(t *testing.T) {
|
||||||
|
qs, err := mockClient(t, []map[string]any{}).ExtractQuestions(context.Background(), "text")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if len(qs) != 0 {
|
||||||
|
t.Errorf("want 0 questions for empty response, got %d", len(qs))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
package parse_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"qbank/internal/parse"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestAcceptanceDOCXPipeline verifies the full DOCX → text → chunk pipeline
|
||||||
|
// using a handcrafted in-memory docx with known content.
|
||||||
|
func TestAcceptanceDOCXPipeline(t *testing.T) {
|
||||||
|
const docXML = `<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||||
|
<w:body>
|
||||||
|
<w:p><w:r><w:t>1. Which keyword declares a variable in Go?</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>A) var</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>B) let</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>C) dim</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>Correct: A</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>2. What does fmt.Println return?</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>A) Nothing</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>B) n int, err error</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>Correct: B</w:t></w:r></w:p>
|
||||||
|
</w:body>
|
||||||
|
</w:document>`
|
||||||
|
|
||||||
|
docx := buildDocx(t, docXML)
|
||||||
|
|
||||||
|
text, err := parse.ExtractDOCX(bytes.NewReader(docx))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExtractDOCX: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
wantPhrases := []string{
|
||||||
|
"Which keyword declares a variable",
|
||||||
|
"fmt.Println",
|
||||||
|
"n int, err error",
|
||||||
|
}
|
||||||
|
for _, phrase := range wantPhrases {
|
||||||
|
if !strings.Contains(text, phrase) {
|
||||||
|
t.Errorf("text missing %q\nfull text:\n%s", phrase, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Chunking should produce at least 1 chunk.
|
||||||
|
chunks := parse.Chunk(text, 10_000)
|
||||||
|
if len(chunks) == 0 {
|
||||||
|
t.Error("Chunk returned 0 chunks for non-empty text")
|
||||||
|
}
|
||||||
|
t.Logf("extracted %d chars, %d chunk(s)", len(text), len(chunks))
|
||||||
|
}
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
package parse
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
// Chunk splits text into paragraphs on blank lines ("\n\n") and packs
// consecutive paragraphs into chunks of at most maxRunes runes.
//
// Paragraphs are whitespace-trimmed and empty ones are skipped. Within a
// chunk, paragraphs are joined by "\n\n", and that separator counts toward
// the limit. A single paragraph longer than maxRunes is never split; it
// becomes a chunk of its own. The result is nil when text contains no
// non-empty paragraph.
func Chunk(text string, maxRunes int) []string {
	var (
		chunks   []string
		cur      strings.Builder
		curRunes int // runes currently held in cur, including separators
	)

	for _, p := range strings.Split(text, "\n\n") {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}

		// Count runes without allocating a []rune copy of the paragraph.
		pLen := 0
		for range p {
			pLen++
		}

		// A running count replaces re-decoding the whole accumulated chunk
		// for every paragraph, which made the loop quadratic.
		if curRunes > 0 && curRunes+2+pLen > maxRunes {
			chunks = append(chunks, cur.String())
			cur.Reset()
			curRunes = 0
		}
		if curRunes > 0 {
			cur.WriteString("\n\n")
			curRunes += 2
		}
		cur.WriteString(p)
		curRunes += pLen
	}

	if curRunes > 0 {
		chunks = append(chunks, cur.String())
	}
	return chunks
}
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
package parse_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"qbank/internal/parse"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestChunk(t *testing.T) {
|
||||||
|
t.Run("small text stays in one chunk", func(t *testing.T) {
|
||||||
|
text := "Para one.\n\nPara two.\n\nPara three."
|
||||||
|
chunks := parse.Chunk(text, 1000)
|
||||||
|
if len(chunks) != 1 {
|
||||||
|
t.Errorf("want 1 chunk, got %d: %v", len(chunks), chunks)
|
||||||
|
}
|
||||||
|
if !strings.Contains(chunks[0], "Para one") || !strings.Contains(chunks[0], "Para three") {
|
||||||
|
t.Errorf("content lost: %q", chunks[0])
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("paragraphs split when over limit", func(t *testing.T) {
|
||||||
|
para := strings.Repeat("x", 600)
|
||||||
|
text := para + "\n\n" + para + "\n\n" + para
|
||||||
|
chunks := parse.Chunk(text, 1000)
|
||||||
|
if len(chunks) < 2 {
|
||||||
|
t.Errorf("want ≥2 chunks for 1800-rune input with 1000 limit, got %d", len(chunks))
|
||||||
|
}
|
||||||
|
// No chunk should combine paragraphs past the limit
|
||||||
|
for i, c := range chunks {
|
||||||
|
if len([]rune(c)) > 1200 {
|
||||||
|
t.Errorf("chunk %d is %d runes, too large", i, len([]rune(c)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("single oversized paragraph kept as own chunk", func(t *testing.T) {
|
||||||
|
bigPara := strings.Repeat("x", 2000)
|
||||||
|
chunks := parse.Chunk(bigPara, 1000)
|
||||||
|
if len(chunks) != 1 {
|
||||||
|
t.Errorf("want 1 chunk for single oversized para, got %d", len(chunks))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("empty paragraphs ignored", func(t *testing.T) {
|
||||||
|
text := "\n\nPara one.\n\n\n\nPara two.\n\n"
|
||||||
|
chunks := parse.Chunk(text, 1000)
|
||||||
|
if len(chunks) != 1 {
|
||||||
|
t.Errorf("want 1 chunk after ignoring blanks, got %d", len(chunks))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
package parse
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/zip"
|
||||||
|
"bytes"
|
||||||
|
"encoding/xml"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ExtractDOCX reads a .docx file and returns its plain text.
|
||||||
|
// DOCX is a ZIP archive; we unzip word/document.xml, walk <w:t> nodes
|
||||||
|
// for text, and emit a newline at each <w:p> boundary.
|
||||||
|
func ExtractDOCX(r io.Reader) (string, error) {
|
||||||
|
data, err := io.ReadAll(r)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("read docx: %w", err)
|
||||||
|
}
|
||||||
|
zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("open docx zip: %w", err)
|
||||||
|
}
|
||||||
|
var docFile *zip.File
|
||||||
|
for _, f := range zr.File {
|
||||||
|
if f.Name == "word/document.xml" {
|
||||||
|
docFile = f
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if docFile == nil {
|
||||||
|
return "", errors.New("word/document.xml not found in docx")
|
||||||
|
}
|
||||||
|
rc, err := docFile.Open()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("open document.xml: %w", err)
|
||||||
|
}
|
||||||
|
defer rc.Close()
|
||||||
|
return parseDocXML(rc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseDocXML streams the document XML from r and returns its text.
//
// Character data is collected only while inside a <t> element, and every
// closing <p> contributes a '\n'. In addition, <br>/<cr> and <tab> elements
// whose direct parent is a run (<r>) contribute '\n' and '\t' respectively,
// so soft line breaks and tab characters inside a paragraph do not glue the
// neighbouring text together. <tab> elements with any other parent (for
// example tab-stop definitions under <tabs>) are ignored. Elements are
// matched by local name only; the namespace is not checked.
//
// The result has leading and trailing whitespace trimmed. Any tokenizer
// error other than io.EOF is returned wrapped.
func parseDocXML(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)
	var (
		sb      strings.Builder
		inText  bool
		parents []string // local names of the currently open elements
	)
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("parse document.xml: %w", err)
		}
		switch t := tok.(type) {
		case xml.StartElement:
			parent := ""
			if n := len(parents); n > 0 {
				parent = parents[n-1]
			}
			switch t.Name.Local {
			case "t":
				inText = true
			case "br", "cr":
				if parent == "r" {
					sb.WriteByte('\n')
				}
			case "tab":
				// Only a tab directly inside a run is a tab character.
				if parent == "r" {
					sb.WriteByte('\t')
				}
			}
			parents = append(parents, t.Name.Local)
		case xml.EndElement:
			if n := len(parents); n > 0 {
				parents = parents[:n-1]
			}
			switch t.Name.Local {
			case "t":
				inText = false
			case "p":
				sb.WriteByte('\n')
			}
		case xml.CharData:
			if inText {
				sb.Write(t)
			}
		}
	}
	return strings.TrimSpace(sb.String()), nil
}
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
package parse_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/zip"
|
||||||
|
"bytes"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"qbank/internal/parse"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestExtractDOCX(t *testing.T) {
|
||||||
|
const docXML = `<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||||
|
<w:body>
|
||||||
|
<w:p><w:r><w:t>Question 1: What is Go?</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>A) A compiled language</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>B) An interpreted language</w:t></w:r></w:p>
|
||||||
|
<w:p><w:r><w:t>C) A markup language</w:t></w:r></w:p>
|
||||||
|
</w:body>
|
||||||
|
</w:document>`
|
||||||
|
|
||||||
|
docx := buildDocx(t, docXML)
|
||||||
|
|
||||||
|
text, err := parse.ExtractDOCX(bytes.NewReader(docx))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExtractDOCX: %v", err)
|
||||||
|
}
|
||||||
|
for _, want := range []string{"Question 1", "compiled language", "interpreted language"} {
|
||||||
|
if !strings.Contains(text, want) {
|
||||||
|
t.Errorf("output missing %q; got:\n%s", want, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractDOCX_MissingXML(t *testing.T) {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
w := zip.NewWriter(&buf)
|
||||||
|
w.Close()
|
||||||
|
|
||||||
|
_, err := parse.ExtractDOCX(bytes.NewReader(buf.Bytes()))
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error for docx without document.xml")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildDocx returns the bytes of a ZIP archive containing a single entry,
// word/document.xml, whose contents are xmlContent. Any failure while
// writing the archive aborts the calling test.
func buildDocx(t *testing.T, xmlContent string) []byte {
	t.Helper()

	must := func(err error) {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
	}

	archive := new(bytes.Buffer)
	zw := zip.NewWriter(archive)

	entry, err := zw.Create("word/document.xml")
	must(err)
	_, err = entry.Write([]byte(xmlContent))
	must(err)
	must(zw.Close())

	return archive.Bytes()
}
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
package parse
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"strings"
|
||||||
|
"unicode"
|
||||||
|
|
||||||
|
"github.com/ledongthuc/pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ErrScanPDF is the sentinel error ExtractPDF returns when the extracted
// text is too short or almost entirely non-alphanumeric, which suggests a
// scan-based (image-only) PDF that cannot be parsed as text.
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
|
||||||
|
|
||||||
|
// ExtractPDF reads a PDF and returns its concatenated plain text.
|
||||||
|
// Returns ErrScanPDF if the content appears to be empty or non-textual.
|
||||||
|
func ExtractPDF(r io.Reader) (string, error) {
|
||||||
|
data, err := io.ReadAll(r)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("read pdf: %w", err)
|
||||||
|
}
|
||||||
|
reader, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("parse pdf: %w", err)
|
||||||
|
}
|
||||||
|
var sb strings.Builder
|
||||||
|
for i := 1; i <= reader.NumPage(); i++ {
|
||||||
|
page := reader.Page(i)
|
||||||
|
if page.V.IsNull() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
text, err := page.GetPlainText(nil)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sb.WriteString(text)
|
||||||
|
sb.WriteByte('\n')
|
||||||
|
}
|
||||||
|
text := sb.String()
|
||||||
|
if isGibberish(text) {
|
||||||
|
return "", ErrScanPDF
|
||||||
|
}
|
||||||
|
return text, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// isGibberish reports whether text is unlikely to be real extracted text:
// it is shorter than 50 runes, or fewer than 2% of its runes are letters or
// digits.
func isGibberish(text string) bool {
	total, alnum := 0, 0
	for _, r := range text {
		total++
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			alnum++
		}
	}
	if total < 50 {
		return true
	}
	return float64(alnum)/float64(total) < 0.02
}
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
package parse
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIsGibberish(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
text string
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{"empty", "", true},
|
||||||
|
{"too short", "hello", true},
|
||||||
|
{"exactly 50 letters", strings.Repeat("a", 50), false},
|
||||||
|
{"49 letters", strings.Repeat("a", 49), true},
|
||||||
|
{"all punctuation", strings.Repeat(".", 100), true},
|
||||||
|
{"1% alpha", strings.Repeat(".", 99) + "a", true},
|
||||||
|
{"2% alpha exactly", strings.Repeat(".", 49) + "a" + strings.Repeat(".", 49) + "a", false},
|
||||||
|
{"normal text", "The quick brown fox jumps over the lazy dog. " + strings.Repeat("word ", 10), false},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := isGibberish(tt.text); got != tt.want {
|
||||||
|
t.Errorf("isGibberish(%q…) = %v, want %v", tt.text[:min(len(tt.text), 20)], got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// min returns the smaller of a and b.
//
// NOTE(review): Go 1.21+ provides a builtin min; this helper shadows it and
// could be dropped if the module targets that version — confirm in go.mod.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||||
Reference in New Issue
Block a user