// qbank/internal/parse/pdf.go

package parse

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"strings"
	"unicode"

	"github.com/ledongthuc/pdf"
)

// ErrScanPDF is the sentinel error ExtractPDF returns when the text it
// extracted is rejected by isGibberish (effectively empty or non-textual).
// That outcome typically indicates a scan-based (image-only) PDF, which has
// no text to parse. Callers can detect it with errors.Is.
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")

// ExtractPDF reads an entire PDF from r and returns the plain text of its
// pages, with each page's text followed by a newline.
//
// Pages that are null, or whose text cannot be extracted, are skipped so that
// one bad page does not discard the rest of the document.
//
// Errors:
//   - a wrapped "read pdf" error if r cannot be read;
//   - a wrapped "parse pdf" error if the data cannot be opened as a PDF, or if
//     the pdf package panics while walking a malformed document;
//   - ErrScanPDF if the collected text is rejected by isGibberish.
//
// On any error the returned string is empty.
func ExtractPDF(r io.Reader) (text string, err error) {
	data, err := io.ReadAll(r)
	if err != nil {
		return "", fmt.Errorf("read pdf: %w", err)
	}

	// The pdf reader may panic (rather than return an error) when it resolves
	// objects in a malformed document. Input here is arbitrary, so convert any
	// such panic into an ordinary error instead of crashing the caller.
	defer func() {
		if rec := recover(); rec != nil {
			text, err = "", fmt.Errorf("parse pdf: %v", rec)
		}
	}()

	reader, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", fmt.Errorf("parse pdf: %w", err)
	}

	var sb strings.Builder
	// The page count does not change while iterating; look it up once.
	numPages := reader.NumPage()
	for i := 1; i <= numPages; i++ { // pages are requested by 1-based number
		page := reader.Page(i)
		if page.V.IsNull() {
			continue
		}
		pageText, pageErr := page.GetPlainText(nil)
		if pageErr != nil {
			// Deliberate best effort: drop the unreadable page and keep the
			// text of the others. If nothing usable remains, the gibberish
			// check below reports ErrScanPDF.
			continue
		}
		sb.WriteString(pageText)
		sb.WriteByte('\n')
	}

	extracted := sb.String()
	if isGibberish(extracted) {
		return "", ErrScanPDF
	}
	return extracted, nil
}

// isGibberish reports whether text is unlikely to be real extracted text:
// either it contains fewer than 50 runes, or fewer than 2% of its runes are
// letters or digits.
func isGibberish(text string) bool {
	const (
		minRunes      = 50   // anything shorter is treated as empty
		minAlnumRatio = 0.02 // minimum share of letter/digit runes
	)

	// Single pass over the string: count all runes and the alphanumeric ones.
	total, alnum := 0, 0
	for _, r := range text {
		total++
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			alnum++
		}
	}

	if total < minRunes {
		return true
	}
	return float64(alnum)/float64(total) < minAlnumRatio
}