Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Jānis Kacēns
2026-05-11 13:03:04 +03:00
parent d9de37d3d8
commit e53e7662e9
13 changed files with 628 additions and 0 deletions
+119
View File
@@ -0,0 +1,119 @@
package llm
import (
	"context"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"strings"

	openai "github.com/sashabaranov/go-openai"
)
// ParsedQuestion holds one multiple-choice question that the LLM extracted
// from a document chunk.
type ParsedQuestion struct {
	// Question is the question text as returned by the LLM.
	Question string
	// Answers lists the answer choices offered for the question.
	Answers []ParsedAnswer
}
// ParsedAnswer describes a single answer choice belonging to a
// ParsedQuestion.
type ParsedAnswer struct {
	// Text is the wording of the answer choice.
	Text string
	// Correct reports whether this choice is marked as the right answer.
	Correct bool
}
// ChatClient abstracts the single chat-completion call this package needs, so
// that a test double can stand in for the real API client.
// The concrete *openai.Client satisfies this interface.
type ChatClient interface {
	// CreateChatCompletion submits req and returns the resulting completion,
	// or an error if the request could not be completed.
	CreateChatCompletion(ctx context.Context, req openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error)
}
// Client layers question-extraction logic on top of a ChatClient.
type Client struct {
	// cc is the backend that executes chat-completion requests.
	cc ChatClient
	// model is the model name placed in every request.
	model string
}
// New returns a Client that talks to the real OpenAI API, authenticated with
// apiKey. When model is empty, "gpt-4o-mini" is used.
func New(apiKey, model string) *Client {
	chosen := model
	if chosen == "" {
		chosen = "gpt-4o-mini"
	}
	client := &Client{
		cc:    openai.NewClient(apiKey),
		model: chosen,
	}
	return client
}
// NewWithClient creates a Client that sends its requests through the injected
// ChatClient (useful for tests).
//
// As with New, an empty model falls back to "gpt-4o-mini" so that a request is
// never issued without a model name. A non-empty model is used unchanged.
func NewWithClient(cc ChatClient, model string) *Client {
	if model == "" {
		model = "gpt-4o-mini"
	}
	return &Client{cc: cc, model: model}
}
// systemPrompt is the system message sent with every extraction request. It
// tells the model to return only questions that are present in the text and
// have a clearly marked correct answer, and it fixes the JSON shape that
// llmResponse decodes.
const systemPrompt = `You extract multiple-choice questions from study material. Return every question found. Exactly one answer per question must be marked correct. If the source doesn't clearly mark a correct answer, omit that question entirely. Do not invent questions not present in the text.
Respond with JSON matching this schema exactly:
{"questions":[{"question":"<text>","answers":[{"text":"<text>","correct":false},{"text":"<text>","correct":true}]}]}`
// llmAnswer mirrors one element of an "answers" array in the LLM's JSON reply.
type llmAnswer struct {
	Text    string `json:"text"`
	Correct bool   `json:"correct"`
}

// llmQuestion mirrors one element of the "questions" array in the LLM's JSON
// reply.
type llmQuestion struct {
	Question string      `json:"question"`
	Answers  []llmAnswer `json:"answers"`
}

// llmResponse is the top-level JSON object the LLM is asked to return: a list
// of questions, each carrying its answer choices.
type llmResponse struct {
	Questions []llmQuestion `json:"questions"`
}
// ExtractQuestions sends chunk to the LLM and returns the validated,
// deduplicated questions found in it.
//
// The request carries the package system prompt followed by chunk as the user
// message, and asks for a JSON-object response. Only the first choice of the
// completion is parsed.
//
// A parsed question is silently dropped when its text is blank, when it does
// not have exactly one answer marked correct, or when an earlier kept question
// in the same response has the same text once runs of whitespace are
// collapsed. Deduplication is scoped to a single call. Kept questions retain
// their text exactly as the LLM returned it and appear in response order.
//
// It returns an error if the API call fails, the completion contains no
// choices, or the message content cannot be decoded into the expected JSON
// shape. The result is nil when no question survives validation.
func (c *Client) ExtractQuestions(ctx context.Context, chunk string) ([]ParsedQuestion, error) {
	resp, err := c.cc.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
		Model: c.model,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleSystem, Content: systemPrompt},
			{Role: openai.ChatMessageRoleUser, Content: chunk},
		},
		ResponseFormat: &openai.ChatCompletionResponseFormat{
			Type: openai.ChatCompletionResponseFormatTypeJSONObject,
		},
	})
	if err != nil {
		return nil, fmt.Errorf("openai: %w", err)
	}
	if len(resp.Choices) == 0 {
		return nil, fmt.Errorf("openai: empty response")
	}

	var raw llmResponse
	if err := json.Unmarshal([]byte(resp.Choices[0].Message.Content), &raw); err != nil {
		return nil, fmt.Errorf("parse llm response: %w", err)
	}

	seen := make(map[string]struct{}, len(raw.Questions))
	var out []ParsedQuestion
	for _, q := range raw.Questions {
		// Collapse whitespace before keying so that the same question rendered
		// with different spacing or line wraps counts as a duplicate. Case is
		// left alone on purpose: "CO" and "Co" can be different questions.
		norm := strings.Join(strings.Fields(q.Question), " ")
		if norm == "" {
			continue // a question without any text is unusable
		}

		nCorrect := 0
		for _, a := range q.Answers {
			if a.Correct {
				nCorrect++
			}
		}
		if nCorrect != 1 {
			continue
		}

		// Only valid questions are recorded as seen, so an invalid first copy
		// does not suppress a later valid one.
		key := textHash(norm)
		if _, dup := seen[key]; dup {
			continue
		}
		seen[key] = struct{}{}

		answers := make([]ParsedAnswer, 0, len(q.Answers))
		for _, a := range q.Answers {
			answers = append(answers, ParsedAnswer{Text: a.Text, Correct: a.Correct})
		}
		out = append(out, ParsedQuestion{Question: q.Question, Answers: answers})
	}
	return out, nil
}
// textHash returns a short fingerprint of s: the first 8 bytes of its SHA-256
// digest, rendered as 16 lowercase hexadecimal characters.
func textHash(s string) string {
	digest := sha256.Sum256([]byte(s))
	prefix := digest[:8]
	return fmt.Sprintf("%x", prefix)
}