Files
2026-05-11 13:03:04 +03:00

63 lines
1.4 KiB
Go

package parse
import (
"bytes"
"errors"
"fmt"
"io"
"strings"
"unicode"
"github.com/ledongthuc/pdf"
)
// ErrScanPDF is returned by ExtractPDF when the extracted text is rejected by
// isGibberish: it is shorter than 50 runes (which includes the empty case) or
// fewer than 2% of its runes are letters or digits. Such output indicates a
// scan-based (image-only) PDF that cannot be parsed as text.
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
// ExtractPDF reads a complete PDF document from r and returns the plain text
// of its pages, in page order, with a newline appended after each page.
//
// The whole input is buffered in memory first, because pdf.NewReader is handed
// a bytes.Reader over the data together with its total length.
//
// Extraction is best-effort per page: a page whose value is null, or whose
// text cannot be extracted, is skipped without failing the call.
//
// Errors:
//   - a "read pdf" wrapped error if r cannot be read to the end;
//   - a "parse pdf" wrapped error if pdf.NewReader rejects the data;
//   - ErrScanPDF if the combined text is judged by isGibberish to be empty,
//     too short, or non-textual.
func ExtractPDF(r io.Reader) (string, error) {
	raw, err := io.ReadAll(r)
	if err != nil {
		return "", fmt.Errorf("read pdf: %w", err)
	}

	doc, err := pdf.NewReader(bytes.NewReader(raw), int64(len(raw)))
	if err != nil {
		return "", fmt.Errorf("parse pdf: %w", err)
	}

	var out strings.Builder
	// Pages are addressed from 1 up to and including NumPage.
	for pageNum := 1; pageNum <= doc.NumPage(); pageNum++ {
		pg := doc.Page(pageNum)
		if pg.V.IsNull() {
			continue
		}
		// A page that fails to extract is dropped; the remaining pages are
		// still collected.
		if content, pageErr := pg.GetPlainText(nil); pageErr == nil {
			out.WriteString(content)
			out.WriteByte('\n')
		}
	}

	if extracted := out.String(); !isGibberish(extracted) {
		return extracted, nil
	}
	return "", ErrScanPDF
}
// isGibberish reports whether text looks like non-textual extraction output.
//
// It returns true when text contains fewer than 50 runes, or when fewer than
// 2% of its runes are letters or digits (per unicode.IsLetter and
// unicode.IsDigit). Lengths are measured in runes, not bytes, and whitespace
// counts toward the total.
func isGibberish(text string) bool {
	const (
		minRunes      = 50   // anything shorter is treated as gibberish outright
		minAlnumRatio = 0.02 // minimum share of letters/digits for real text
	)

	// Ranging over a string yields one rune per iteration, so a single pass
	// gives both the rune count and the alphanumeric count.
	total, alnum := 0, 0
	for _, r := range text {
		total++
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			alnum++
		}
	}

	if total < minRunes {
		return true
	}
	return float64(alnum)/float64(total) < minAlnumRatio
}