Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 13:03:04 +03:00
parent d9de37d3d8
commit e53e7662e9
13 changed files with 628 additions and 0 deletions
@@ -0,0 +1,62 @@
+package parse
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+	"unicode"
+
+	"github.com/ledongthuc/pdf"
+)
+
+// ErrScanPDF is the sentinel ExtractPDF returns when the extracted text is too
+// short or almost entirely non-alphanumeric, as with scanned (image-only) PDFs.
+var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
+
+// ExtractPDF buffers the whole PDF from r and returns the plain text of its
+// pages, each followed by a newline. Read and parse failures come back wrapped;
+// ErrScanPDF is returned when the collected text is judged gibberish.
+func ExtractPDF(r io.Reader) (string, error) {
+	raw, err := io.ReadAll(r)
+	if err != nil {
+		return "", fmt.Errorf("read pdf: %w", err)
+	}
+	doc, err := pdf.NewReader(bytes.NewReader(raw), int64(len(raw)))
+	if err != nil {
+		return "", fmt.Errorf("parse pdf: %w", err)
+	}
+	var out strings.Builder
+	pages := doc.NumPage()
+	for n := 1; n <= pages; n++ {
+		pg := doc.Page(n)
+		if pg.V.IsNull() {
+			continue
+		}
+		// Best effort: a page whose text cannot be extracted is skipped.
+		if body, err := pg.GetPlainText(nil); err == nil {
+			out.WriteString(body)
+			out.WriteByte('\n')
+		}
+	}
+	if joined := out.String(); !isGibberish(joined) {
+		return joined, nil
+	}
+	return "", ErrScanPDF
+}
+
+// isGibberish returns true when text is too short or has < 2% alphanumeric content.
+func isGibberish(text string) bool {
+	runes := []rune(text)
+	if len(runes) < 50 {
+		return true
+	}
+	var alpha int
+	for _, c := range runes {
+		if unicode.IsLetter(c) || unicode.IsDigit(c) {
+			alpha++
+		}
+	}
+	return float64(alpha)/float64(len(runes)) < 0.02
+}